# Merchant Ranking Algorithm
The method of ranking each merchant, will take inspiration from the methods proposed in (https://sapinsider.org/leveraging-analytical-method-for-ranking-suppliers/), in which we scale each sub_attribute with min-max norminalisation, then sum them together to get the overall score for a particular Metric

The Key Attributes/Metrics are: 
- Finantial 
- Customer_Base 
- Sustainability

In [126]:
from pyspark.sql import SparkSession, functions as F
from pyspark.ml import feature as H
from pyspark.sql import Window
from pyspark.sql.functions import coalesce, col

import re, sys, string
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [127]:
def open_spark():
    spark = (
    SparkSession.builder.appName("Data_Explorer")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.driver.memory", "6g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate())
    return spark

In [95]:
# NOTE: This value can be changed to point to the directory where all the tables are stored. By default, it's the data folder in the generic-buy-now-pay-later directory.

dir = "../data/"

In [96]:
def feature_standardisation(dataset, max_columns, min_columns = []): 
    """
    Function for scaling every individual score
    """
    # if higher values are preferred
    for col_name in max_columns:
        values = dataset.select(F.max(col_name).alias('high'), F.min(col_name).alias('low'))
        dataset = dataset.withColumn(col_name, F.round((F.col(col_name) - values.select('low').head()[0]) / (values.select('high').head()[0] - values.select('low').head()[0]), 4))
    # if lower values are preferred
    if min_columns:
        for col_name in min_columns:
            values = dataset.select(F.max(col_name).alias('high'), F.min(col_name).alias('low'))
            dataset = dataset.withColumn(col_name, F.round((values.select('high').head()[0] - F.col(col_name)) / (values.select('high').head()[0] - values.select('low').head()[0]), 4))
    return dataset

# 1. Financial Metric
For this we will look into:
- Total_Revenue
- Average_Revenue_Growth

These are both taken over the last 6 months

In [97]:
def revenue_growth_score(revenue_data):
    """
    Function to calculate revenue growth per merchant
    """
    revenue_data_table = revenue_data.select('merchant_abn').distinct()
    
    months = [3,4,5,6,7,8,9]
    for month in months:
        end_month = month + 1
        if end_month < 10:
            revenue_data_table = revenue_data_table.join(revenue_data.where((F.col('order_datetime') < F.lit('2022-0' + str(end_month)+'-01'))).groupBy('merchant_abn').agg(F.sum('BNPL_weighted_Revenue').alias('Month_' + str(month))), on='merchant_abn', how='left')
        else:
            revenue_data_table = revenue_data_table.join(revenue_data.where((F.col('order_datetime') < F.lit('2022-' + str(end_month)+'-01'))).groupBy('merchant_abn').agg(F.sum('BNPL_weighted_Revenue').alias('Month_' + str(month))), on='merchant_abn', how='left')
    revenue_data_table = revenue_data_table.fillna(1)

    # Now need to calculate growth rate for each month
    revenue_data_table = revenue_data_table.withColumn('Growth_4', (F.col('Month_4') - F.col('Month_3')) / F.col('Month_4'))
    revenue_data_table = revenue_data_table.withColumn('Growth_5', (F.col('Month_5') - F.col('Month_4')) / F.col('Month_5'))
    revenue_data_table = revenue_data_table.withColumn('Growth_6', (F.col('Month_6') - F.col('Month_5')) / F.col('Month_6'))
    revenue_data_table = revenue_data_table.withColumn('Growth_7', (F.col('Month_7') - F.col('Month_6'))/ F.col('Month_7'))
    revenue_data_table = revenue_data_table.withColumn('Growth_8', (F.col('Month_8') - F.col('Month_7')) / F.col('Month_8'))
    revenue_data_table = revenue_data_table.withColumn('Growth_9', (F.col('Month_9') - F.col('Month_8')) / F.col('Month_9'))

    revenue_data_table = revenue_data_table.withColumn('Revenue_Growth_Avg', F.round((F.col('Growth_4') + F.col('Growth_5') + F.col('Growth_6') + F.col('Growth_7') + F.col('Growth_8') + F.col('Growth_9')) / 6 , 4))
    revenue_data_table = revenue_data_table.select('merchant_abn', 'Revenue_Growth_Avg')
    return revenue_data_table

In [128]:
def financial_score(RECENCY, transactions, fraud_data):
    """
    Function to score each merchant on their financial aspects
    """
    revenue_data = transactions.where(F.col('order_datetime') > RECENCY).select('merchant_abn','user_id', 'order_datetime', 'dollar_value', 'BNPL_Revenue')
    
    revenue_data = revenue_data.join(fraud_data, on=['merchant_abn', 'user_id', 'order_datetime'], how='left')
    # Firstly, calculate the revenue score and weight the revenue with probability
    revenue_data = revenue_data.withColumn('BNPL_weighted_Revenue', F.col('BNPL_Revenue') * (1 - 0.1 * F.col('prediction')))

    # Now can calculate each Merchant's revenue
    revenue_final = revenue_data.groupBy('merchant_abn').agg(F.round(F.sum('BNPL_weighted_Revenue'), 2).alias('Total_Revenue'))

    # Next, get the revenue growth score 
    revenue_final = revenue_final.join(revenue_growth_score(revenue_data), on='merchant_abn', how='left')

    # Finally scale
    revenue_final = feature_standardisation(revenue_final, ['Revenue_Growth_Avg', 'Total_Revenue'])
    
    revenue_final.write.parquet('../' + dir + 'curated/Metric_Finantial_scaled', mode ='overwrite')

# Sustainability 
- Experience
- Customer Growth (Last 6 months)
- Postcode_Entropy
- Industry score (added when scoring each metric [next notebook])

In [98]:
def customer_growth(full_dataset):
    """
    Function to calculate customer growth for each merchant
    """
    Unique_cust = full_dataset.where(F.col('order_datetime') < F.lit('2022-05-01')).groupBy('merchant_abn').agg(F.countDistinct('user_id').alias('Unique_Customers'))

    # now for the past 6 monts calculate 
    months = [5,6,7,8,9,10]
    customer_counting_data = full_dataset.select('merchant_abn', 'user_id', 'order_datetime', 'month')
    for month in months:
        end_month = month + 1
        if end_month < 10:
            Unique_cust = Unique_cust.join(customer_counting_data.where((F.col('order_datetime') < F.lit('2022-0' + str(end_month)+'-01'))).groupBy('merchant_abn').agg(F.countDistinct('user_id').alias('Month_' + str(month))), on='merchant_abn', how='left')
        else:
            Unique_cust = Unique_cust.join(customer_counting_data.where((F.col('order_datetime') < F.lit('2022-' + str(end_month)+'-01'))).groupBy('merchant_abn').agg(F.countDistinct('user_id').alias('Month_' + str(month))), on='merchant_abn', how='left')
            
    Unique_cust = Unique_cust.fillna(1)

    # Now need to calculate growth rate for each month
    Unique_cust = Unique_cust.withColumn('Growth_5', F.col('Unique_Customers') / F.col('Month_5'))
    Unique_cust = Unique_cust.withColumn('Growth_6', (F.col('Month_6') - F.col('Month_5')) / F.col('Month_6'))
    Unique_cust = Unique_cust.withColumn('Growth_7', (F.col('Month_7') - F.col('Month_6'))/ F.col('Month_7'))
    Unique_cust = Unique_cust.withColumn('Growth_8', (F.col('Month_8') - F.col('Month_7')) / F.col('Month_8'))
    Unique_cust = Unique_cust.withColumn('Growth_9', (F.col('Month_9') - F.col('Month_8')) / F.col('Month_9'))
    Unique_cust = Unique_cust.withColumn('Growth_10',(F.col('Month_10') - F.col('Month_9')) / F.col('Month_10'))

    # Finally get the average customer growth rate over the last 6 months
    Unique_cust = Unique_cust.withColumn('Customer_Growth_Avg', F.round((F.col('Growth_5') + F.col('Growth_6') + F.col('Growth_7') + F.col('Growth_8') + F.col('Growth_9') + F.col('Growth_10')) / 6 , 4))
    Unique_cust = Unique_cust.withColumn('Customer_Growth_Avg', F.when(F.col('Customer_Growth_Avg').isNull(), 0).otherwise(F.col('Customer_Growth_Avg')))
    return Unique_cust.select('merchant_abn', 'Customer_Growth_Avg')

In [129]:
def sustainability_score(merchants, full_dataset):
    merchants_sub = merchants.select(['merchant_abn', 'postcode_entropy'])
    merchants_sub = merchants_sub.join(customer_growth(full_dataset), on='merchant_abn', how='left')

    # calculate experience
    number_of_dates_by_merchant = full_dataset.groupBy('merchant_abn','order_datetime').count()
    number_of_dates_by_merchant = number_of_dates_by_merchant.drop('order_datetime')
    number_of_dates_by_merchant = number_of_dates_by_merchant.groupBy('merchant_abn').sum('count')
    
    merchants_sub = merchants_sub.join(number_of_dates_by_merchant.withColumnRenamed('sum(count)','Total_Business_Days'), on='merchant_abn', how='left')
    merchants_final = feature_standardisation(merchants_sub, ['Total_Business_Days', 'Customer_Growth_Avg'], ['postcode_entropy'])
    
    merchants_final.write.parquet(dir + 'curated/Metric_Sustainability_scaled', mode = 'overwrite')

# Customer Base
In this sections we use the features:
- Transaction_per_User
- customer_wealth
- Unique_Customers
- Loyal_AVG
- postcode_entropy

In [99]:
def loyalty(full_dataset):
    """
    Function to create loyalty feature for customer base score calculation
    """
    grouped = full_dataset.groupBy("user_id", "merchant_abn")
    RPR = grouped.count().withColumnRenamed("count", "RPR")
    upSell = RPR.groupBy("user_id").count().withColumnRenamed("count", "upsell")
    CLV = grouped.sum("dollar_value").withColumnRenamed("sum(dollar_value)", "CLV")

    # Define the window
    w = Window.partitionBy(["user_id", "merchant_abn"]).orderBy("order_datetime")

    retention = full_dataset.withColumn(
        'diff',
        F.datediff(F.col("order_datetime"), F.lag("order_datetime").over(w))
    ).groupBy("user_id", "merchant_abn").agg(F.avg(F.col("diff")).alias("retention"))

    loyal = retention.na.fill(value=365).join(RPR, on=["user_id", "merchant_abn"], how="left").join(CLV, on=["user_id", "merchant_abn"], how="left").join(upSell, on=["user_id"], how="left")

    loyal = loyal.withColumn("loyal", F.col("RPR") * F.col("CLV") * F.col("upSell") / F.col("retention"))

    loyal = loyal.select(*(col(c).cast("float").alias(c) for c in loyal.columns))

    loyal = loyal.select('user_id', 'merchant_abn', 'loyal')

    magic_percentile = F.expr('percentile_approx(loyal, 0.5)')
    loyal = loyal.join(loyal.groupBy('user_id').agg(magic_percentile.alias('med_val')), on='user_id', how='left')
    loyal = loyal.withColumn('loyal', coalesce('loyal', 'med_val')).select(['user_id', 'merchant_abn', 'loyal'])
    loyal_agg = loyal.groupBy('merchant_abn').agg(F.round(F.avg('loyal'), 2).alias('Loyal_AVG'))
    
    return loyal_agg

In [100]:
def customer_entropy(full_dataset):
    # Get customer Entropy 
    cust_entro = full_dataset.select('merchant_abn', 'user_id', 'postcode', 'order_datetime')
    Entropy_cust = cust_entro.select('merchant_abn').toDF('merchant_abn').drop_duplicates().crossJoin(cust_entro.select('postcode').toDF('postcode').drop_duplicates())
    Entropy_cust = Entropy_cust.join(cust_entro.groupBy('merchant_abn', 'postcode').agg(F.countDistinct('user_id').alias('Count')), on=['merchant_abn', 'postcode'], how='left')
    Entropy_cust = Entropy_cust.fillna(1)
    Entropy_cust = Entropy_cust.join(Entropy_cust.groupBy("merchant_abn").sum('Count'), on='merchant_abn', how='left')
    Entropy_cust = Entropy_cust.withColumn('Probability', F.col('Count') / F.col('sum(Count)'))
    Entropy_cust = Entropy_cust.groupBy("merchant_abn").agg((-F.sum(F.col("Probability") * F.log2(col("Probability")))).alias("Customer_Entropy"))
    return Entropy_cust

In [101]:
def customer_score(full_dataset, customers):
    full_dataset = full_dataset.join(customers, on='user_id')
    # add some of the attributes
    cust_data = full_dataset.groupBy('merchant_abn').agg(F.countDistinct('user_id').alias('Unique_Customers'), F.round(F.count('user_id') / F.countDistinct('user_id'), 2).alias('Transaction_per_User'), F.round(F.avg('Average taxable income or loss'),2).alias('customer_wealth'))
    # add the loyalty score
    cust_data = cust_data.join(loyalty(full_dataset), on='merchant_abn')
    cust_data = cust_data.join(customer_entropy(full_dataset), on='merchant_abn', how='left')
    cust_final = feature_standardisation(cust_data, ['Transaction_per_User', 'customer_wealth', 'Unique_Customers', 'Loyal_AVG'], ['Customer_Entropy'])
    # save file 
    cust_final.write.parquet(dir + 'curated/Metric_Customer_scaled', mode = 'overwrite')

# Industry/Environment 
We add a general score, based on their description tags, in relation to which industry they belong to, including:
- Market Dominance
- Survival Rate
- Fraud_Prob_Avg
- Customer_Base
- Growth

In [102]:
def description_transformation(spark):
    Administrative_Support_Services= ["equipment tool furniture appliance rent al leasing",
                                      "florist supply nursery stock flower",
                                      "lawn garden supply outlet including nursery"]
    Personal_Services= ["shoe shop",
                        "gift card novelty souvenir shop",
                        "antique shop sale repair restoration service",
                        "watch clock jewelry repair shop",
                        "jewelry watch clock silverware shop",
                        "motor vehicle supply new part",
                        "furniture home furnishing equipment shop manufacturer except appliance",
                        "tent awning shop",
                        "optician optical good eyeglass"]
    Arts_Recreation_Services = ["digital good book movie music",
                                "music shop musical instrument piano sheet music",
                                "health beauty spa",
                                "bicycle shop sale service",
                                "art dealer gallery",
                                "hobby toy game shop",
                                "stationery office supply printing writing paper"]
    Information_Media_Telecommunications = ["telecom",
                                            "computer programming data processing integrated system design service",
                                            "book periodical newspaper",
                                            "artist supply craft shop",
                                            "computer computer peripheral equipment software",
                                            "cable satellite pay television radio service"]
    desc = []
    tags = []
    for ele in Administrative_Support_Services:
        desc.append("Administrative_Support_Services")
    tags += Administrative_Support_Services
    for ele in Personal_Services:
        desc.append("Personal_Services")
    tags += Personal_Services
    for ele in Arts_Recreation_Services:
        desc.append("Arts_Recreation_Services")
    tags += Arts_Recreation_Services
    for ele in Information_Media_Telecommunications:
        desc.append("Information_Media_Telecommunications")
    tags += Information_Media_Telecommunications

    to_df = {
        "sector" :  desc,
        "Description" : tags
    }
    sectors = spark.createDataFrame(pd.DataFrame(to_df))
    # save sectors
    return sectors

In [120]:
def industry(full_dataset, fraud_probabilities, spark, RECENCY):
    desc_lookup = spark.read.option("inferSchema", True).option("header", True).csv(dir + "/tables/description_lookup.csv")
    industry_data = full_dataset.join(desc_lookup, )
    industry_data = full_dataset.select('merchant_abn','user_id', 'order_datetime', 'dollar_value', 'BNPL_Revenue', 'Description')
    # Now join fraud prob 
    industry_data = industry_data.join(fraud_probabilities, on=['merchant_abn', 'user_id', 'order_datetime'], how='left')
    industry_data = industry_data.withColumn('BNPL_weighted_Revenue', F.col('BNPL_Revenue') * (1 - 0.1 * F.col('prediction'))
                                                                 )
    dataset_sector = description_transformation(spark)
    industry_data = industry_data.join(dataset_sector, on=['Description'], how='inner').drop('Description')
    Merchant_Industry = industry_data.select('merchant_abn', 'sector').distinct()
    # Market Dominance
    industry_table = industry_data.where(F.col('order_datetime') > RECENCY).groupBy('sector').agg(F.sum('BNPL_weighted_Revenue').alias('Total_Weighted_Revenue'), F.count('BNPL_weighted_Revenue').alias('Total_Transactions'), F.avg('BNPL_weighted_Revenue').alias('Average_Weighted_Revenue'))
    total_rev = industry_table.groupBy().sum().collect()[0][0]
    total_transa = industry_table.groupBy().sum().collect()[0][0]
    industry_table = industry_table.withColumn('Total_Weighted_Revenue', F.col('Total_Weighted_Revenue') / total_rev).withColumnRenamed('Total_Weighted_Revenue', 'Portion_of_Total_Revenue')
    industry_table = industry_table.withColumn('Total_Transactions', F.col('Total_Transactions') / total_transa).withColumnRenamed('Total_Transactions', 'Portion_of_Total_Transactions')
    # Survival Rate - data/tables/sector_average.parquet
    sector_info = spark.read.parquet(dir + 'tables/sector_information.parquet')
    industry_table = industry_table.join(sector_info.select(F.col('sector'), F.col('survival_rate').cast('double').alias("survival_rate")), on='sector', how='left')
    industry_table = feature_standardisation(industry_table, ['Portion_of_Total_Revenue', 'Portion_of_Total_Transactions', 'survival_rate', 'Average_Weighted_Revenue'])
    # ensure assigned to each merchant
    industry_table = Merchant_Industry.join(industry_table, on='sector', how='left')
    # save file 
    industry_table.write.parquet(dir + 'curated/Metric_industry_scaled', mode = 'overwrite')

In [104]:
# This is the main function, which can run everything

spark = open_spark()
customers = spark.read.option("inferSchema", True).parquet(dir + 'processed/customers/')
transactions = spark.read.option("inferSchema", True).parquet(dir + 'processed/transactions')
merchants = spark.read.option("inferSchema", True).parquet(dir + "processed/merchants")
fraud_probabilities = spark.read.parquet(dir + '../models/random_forest_output_full/')

In [105]:
desc_lookup = spark.read.option("header", False).csv(dir + "/tables/description_lookup.csv")

In [106]:
desc_lookup

_c0,_c1
furniture home fu...,0
cable satellite p...,1
jewelry watch clo...,2
watch clock jewel...,3
music shop musica...,4
gift card novelty...,5
computer computer...,6
computer programm...,7
equipment tool fu...,8
artist supply cra...,9


In [107]:
final_data_collection = merchants.select('merchant_abn')
fraud_probabilities = fraud_probabilities.select('merchant_abn', 'user_id', 'order_datetime', 'prediction')


In [108]:

# APPLY PREVIOUSELY SHARED & UNIMPLEMENTED FUNCTIONS
merchants = merchants.join(desc_lookup, (merchants.tags==desc_lookup._c1)).drop('tags')
merchants = merchants.withColumnRenamed('_c0', 'Description').drop('_c1')
full_dataset = transactions.join(merchants, on='merchant_abn', how='inner')
# calculate the BNPL unweighted revenue 
full_dataset = full_dataset.withColumn('BNPL_Revenue', F.col('dollar_value') * F.col('BNPL_Fee'))
# First calculate the finantial score

# We will for now determine as last 6 months (i.e march) 
RECENCY =  F.lit('2022-03-01')

In [109]:
merchants.limit(3)

merchant_abn,name,Earnings_Class,BNPL_Fee,avg_monthly_inc,monthly_entropy,postcode_entropy,revenue,Description
10023283211,Felis Limited,e,0.18,-0.0952381,2.9858725,7.439226,703277.8708539009,furniture home fu...
10142254217,Arcu Ac Orci Corp...,b,4.22,0.0,2.9779751,7.418948,118355.94002620316,cable satellite p...
10165489824,Nunc Sed Company,b,4.4,3.0,1.609438,1.609438,56180.48120117188,jewelry watch clo...


In [110]:
# Now run each score, which will save results to file
# Note: customer score takes much linger, and sould probably run seperate
financial_score(RECENCY, full_dataset, fraud_probabilities)

                                                                                

In [111]:
fin_score = spark.read.parquet(dir + '/curated/Metric_Finantial_scaled')
print(fin_score.count())
fin_score.limit(3)

4008


merchant_abn,Total_Revenue,Revenue_Growth_Avg
12516851436,0.0036,0.55
15613631617,0.0027,0.4736
19839532017,0.008,0.4683


In [112]:
sustainability_score(merchants, full_dataset)

                                                                                

In [113]:
sust_score = spark.read.parquet(dir + '/curated/Metric_Sustainability_scaled')
print(sust_score.count())
sust_score.limit(3)

3858


merchant_abn,postcode_entropy,Customer_Growth_Avg,Total_Business_Days
10255988167,0.1851,0.4386,0.0029
10430380319,0.3681,0.469,0.0005
10618089367,0.0633,0.4418,0.0118


In [114]:
customer_score(full_dataset, customers)

                                                                                

In [115]:
cust_score = spark.read.parquet(dir + '/curated/Metric_Customer_scaled')
print(cust_score.count())
cust_score.limit(3)

3858


merchant_abn,Unique_Customers,Transaction_per_User,customer_wealth,Loyal_AVG,Customer_Entropy
10255988167,0.0338,0.0018,0.4196,0.0186,0.0842
10430380319,0.0064,0.0,0.4133,0.006,0.0
10618089367,0.1325,0.0064,0.4244,0.026,0.6354


In [121]:
industry(full_dataset, fraud_probabilities, spark, RECENCY)

                                                                                

In [122]:
ind_score = spark.read.parquet(dir + '/curated/Metric_industry_scaled')
print(ind_score.count())
ind_score.limit(3)

3858


sector,merchant_abn,Portion_of_Total_Revenue,Portion_of_Total_Transactions,Average_Weighted_Revenue,survival_rate
Administrative_Su...,11633090957,0.0,0.0,0.9449,0.0
Administrative_Su...,22718657980,0.0,0.0,0.9449,0.0
Administrative_Su...,68112267199,0.0,0.0,0.9449,0.0
