# Merchant Ranking Algorithm
The method of ranking each merchant, will take inspiration from the methods proposed in (https://sapinsider.org/leveraging-analytical-method-for-ranking-suppliers/), in which we rank each key attribute out of 5, then sum them together with weightsget a score for each merchant. 

The Key Attributes for now are: 
- Merchant's BNPL Revenue
- Proportion of 'identified' Fraud transactions
- The customer Base 
- Projected Growth

In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.ml import feature as H
import sys
# First lets read the datasets
spark = (
    SparkSession.builder.appName("Data_Explorer")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/10/06 05:26:55 WARN Utils: Your hostname, James-N580VD-DM229T resolves to a loopback address: 127.0.1.1; using 172.30.148.210 instead (on interface eth0)
22/10/06 05:26:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/06 05:26:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


1. Tansaction_Revenue
For this we will take:
- Total_BNPL_Revenue
- Average_BNPL_Revenue

In [2]:
dir = sys.argv[4] + '/'
transactions = spark.read.parquet(dir + 'processed/transactions')
merchants =  spark.read.parquet(dir + 'processed/merchants/')
full_dataset = spark.read.parquet(dir + 'curated/full_dataset/')
final_data_collection = merchants.select('merchant_abn')

                                                                                

In [3]:
final_data_collection = final_data_collection.join(full_dataset.where(F.col('Potential_outlier') == False).groupBy('merchant_abn').agg(F.round(F.sum('BNPL_Revenue'), 2).alias('Total_BNPL_Revenue'), F.round(F.sum('dollar_value'), 2).alias('Total_Dollar_value'), F.count('BNPL_Revenue').alias('Total_Transactions')), on=['merchant_abn'])
# now add last 6 months
final_data_collection = final_data_collection.join(full_dataset.where((F.col('order_datetime') > F.lit('2022-03-01')) & (F.col('Potential_outlier') == False)).groupBy('merchant_abn').agg(F.round(F.sum('BNPL_Revenue'), 2).alias('Total_BNPL_Revenue_6MON'), F.round(F.sum('dollar_value'), 2).alias('Total_Dollar_value_6MON'), F.count('BNPL_Revenue').alias('Total_Transactions_6MON')), on=['merchant_abn'])
# now add fraud attributes
final_data_collection = final_data_collection.join(full_dataset.where(F.col('Potential_outlier') == True).groupBy('merchant_abn').agg(F.round(F.sum('BNPL_Revenue'), 2).alias('Total_BNPL_Revenue_Fraud'), F.count('BNPL_Revenue').alias('Total_Transactions_Fraud'), F.round(F.sum('dollar_value'), 2).alias('Total_Dollar_value_Fraud')), on=['merchant_abn'])
final_data_collection = final_data_collection.join(full_dataset.where((F.col('order_datetime') > F.lit('2022-03-01')) & (F.col('Potential_outlier') == True)).groupBy('merchant_abn').agg(F.round(F.sum('BNPL_Revenue'), 2).alias('Total_BNPL_Revenue_6MON_Fraud'), F.count('BNPL_Revenue').alias('Total_Transactions_6MON_Fraud'), F.round(F.sum('dollar_value'), 2).alias('Total_Dollar_value_6MON_Fraud')), on=['merchant_abn'])

In [4]:
# Get proportion values
final_data_collection = final_data_collection.withColumn('Total_BNPL_Revenue_FP', F.col('Total_BNPL_Revenue_Fraud') / (F.col('Total_BNPL_Revenue') + F.col('Total_BNPL_Revenue_Fraud')))
final_data_collection = final_data_collection.withColumn('Total_Dollar_value_FP', F.col('Total_BNPL_Revenue_Fraud') / (F.col('Total_Dollar_value') + F.col('Total_BNPL_Revenue_Fraud')))
final_data_collection = final_data_collection.withColumn('Total_Transactions_FP', F.col('Total_Transactions_Fraud') / (F.col('Total_Transactions') + F.col('Total_Transactions_Fraud')))
final_data_collection = final_data_collection.withColumn('Total_BNPL_Revenue_6MON_FP', F.col('Total_BNPL_Revenue_6MON_Fraud') / (F.col('Total_BNPL_Revenue_6MON') + F.col('Total_BNPL_Revenue_6MON_Fraud')))
final_data_collection = final_data_collection.withColumn('Total_Dollar_value_6MON_FP', F.col('Total_BNPL_Revenue_6MON_Fraud') / (F.col('Total_Dollar_value_6MON') + F.col('Total_BNPL_Revenue_6MON_Fraud')))
final_data_collection = final_data_collection.withColumn('Total_Transactions_6MON_FP', F.col('Total_Transactions_6MON_Fraud') / (F.col('Total_Transactions_6MON') + F.col('Total_Transactions_6MON_Fraud')))

In [5]:
final_data_collection = final_data_collection.select('merchant_abn', 'Total_BNPL_Revenue', 'Total_Dollar_value', 'Total_Transactions', 'Total_BNPL_Revenue_6MON', 'Total_Dollar_value_6MON', 'Total_Transactions_6MON', 'Total_BNPL_Revenue_FP', 'Total_Dollar_value_FP', 'Total_Transactions_FP', 'Total_BNPL_Revenue_6MON_FP', 'Total_Dollar_value_6MON_FP', 'Total_Transactions_6MON_FP')

In [6]:
for col_name in final_data_collection.columns[1:]:
    values = final_data_collection.select(F.max(col_name).alias('high'), F.min(col_name).alias('low'))
    final_data_collection = final_data_collection.withColumn(col_name, (F.col(col_name) - values.select('low').head()[0]) / (values.select('high').head()[0] - values.select('low').head()[0]))

                                                                                

In [7]:
# Now need to perform Revenue and Risk score
final_data_collection = final_data_collection.withColumn('Revenue_Score', F.col('Total_BNPL_Revenue') + F.col('Total_Dollar_value') + F.col('Total_Transactions') +
                                                  F.col('Total_BNPL_Revenue_6MON') + F.col('Total_Dollar_value_6MON') + F.col('Total_Transactions_6MON'))
# And Risk Revenue score 
final_data_collection = final_data_collection.withColumn('Risk_Revenue_Score', -1 * (F.col('Total_BNPL_Revenue_FP') + F.col('Total_Dollar_value_FP') + F.col('Total_Transactions_FP') +
                                                  F.col('Total_BNPL_Revenue_6MON_FP') + F.col('Total_Dollar_value_6MON_FP') + F.col('Total_Transactions_6MON_FP')))

In [8]:
final_data_collection = final_data_collection.select(['merchant_abn', 'Revenue_Score', 'Risk_Revenue_Score'])

# Merchant Sustainability
Next, we add a rating for a companies growth

In [9]:
merchants_sub = merchants.select(['merchant_abn', 'avg_monthly_inc', 'postcode_entropy'])

In [10]:
for col_name in merchants_sub.columns[1:]:
    values = merchants_sub.select(F.max(col_name).alias('high'), F.min(col_name).alias('low'))
    merchants_sub = merchants_sub.withColumn(col_name, (F.col(col_name) - values.select('low').head()[0]) / (values.select('high').head()[0] - values.select('low').head()[0]))

In [11]:
merchants_sub = merchants_sub.withColumn('Sustainability_score', F.col('avg_monthly_inc') - F.col('postcode_entropy'))

In [12]:
merchants_sub = merchants_sub.select(['merchant_abn', 'Sustainability_score'])
final_data_collection = final_data_collection.join(merchants_sub, on=['merchant_abn'])

# Customer Rankings
In this sections we use the features:
- customer_loyalty_agg
- unique_cust
- Fraud_cust
- Cust_tax

In [13]:
# Create Loyalfy feature
grouped = full_dataset.groupBy("user_id", "merchant_abn")
RPR = grouped.count().withColumnRenamed("count", "RPR")
upSell = RPR.groupBy("user_id").count().withColumnRenamed("count", "upsell")
CLV = grouped.sum("dollar_value").withColumnRenamed("sum(dollar_value)", "CLV")
# Define the window
from pyspark.sql.window import Window
w = Window.partitionBy(["user_id", "merchant_abn"]).orderBy("order_datetime")

retention = full_dataset.withColumn(
    'diff',
    F.datediff(F.col("order_datetime"), F.lag("order_datetime").over(w))
).groupBy("user_id", "merchant_abn").agg(F.avg(F.col("diff")).alias("retention"))
retention.agg({"retention":"max"}).collect()
retention.na.fill(value=365)
loyal = retention.na.fill(value=365).join(RPR, on=["user_id", "merchant_abn"], how="left").join(CLV, on=["user_id", "merchant_abn"], how="left").join(upSell, on=["user_id"], how="left")
loyal = loyal.withColumn("loyal", F.col("RPR") * F.col("CLV") * F.col("upSell") / F.col("retention"))

                                                                                

In [14]:
from pyspark.sql.functions import col
loyal = loyal.select(*(col(c).cast("float").alias(c) for c in loyal.columns))
loyal = loyal.select('user_id', 'merchant_abn', 'loyal')

In [15]:
# Join to dataset
# full_dataset = full_dataset.join(loyal, on=['user_id', 'merchant_abn'])

In [16]:
customer_attr =  full_dataset.groupBy('merchant_abn').agg(F.countDistinct('user_id').alias('Unique_Customers'), F.round(F.count('user_id') / F.countDistinct('user_id'), 2).alias('Transaction_per_User'), F.round(F.avg('Proportion_Unreg_Merchant_Transactions'),2).alias('Customer_Defects'), F.round(F.avg('Average taxable income or loss'),2).alias('customer_wealth'))

In [17]:
for col_name in customer_attr.columns[1:]:
    values = customer_attr.select(F.max(col_name).alias('high'), F.min(col_name).alias('low'))
    customer_attr = customer_attr.withColumn(col_name, (F.col(col_name) - values.select('low').head()[0]) / (values.select('high').head()[0] - values.select('low').head()[0]))

                                                                                

In [25]:
customer_attr = customer_attr.withColumn('Customer_score', F.col('customer_wealth') + F.col('Unique_Customers') + F.col('Transaction_per_User') - F.col('Customer_Defects'))
customer_attr = customer_attr.select(['merchant_abn', 'Customer_score'])
final_data_collection = final_data_collection.join(customer_attr, on=['merchant_abn'])

In [27]:
# Finally get the score
final_data_collection = final_data_collection.withColumn('Merchant_score', F.col('Customer_score') + F.col('Sustainability_score') + F.col('Revenue_Score') + F.col('Risk_Revenue_Score'))

In [30]:
final_data_collection.orderBy(F.col("Merchant_score").desc())

                                                                                

merchant_abn,Revenue_Score,Risk_Revenue_Score,Sustainability_score,Customer_score,Merchant_score
86578477987,5.651437445087812,-0.3647419261481112,-0.9903022207141438,2.017482417230664,6.313875715456221
89726005175,4.984314995435019,-0.2497744360884568,-0.9418296612462292,1.808005612930851,5.600716511031185
24852446429,4.508609850673027,-0.3158409047254606,-1.0,2.07978456470488,5.272553510652447
43186523025,4.496262931086211,-0.2357561116522806,-0.8727155689361166,1.7533577332664931,5.141148983764307
45629217853,4.770667057336847,-0.71256249436323,-0.9429787346530488,1.8517847831315355,4.966910611452104
21439773999,4.473698759325553,-0.3393044971453385,-0.8592531915330186,1.4515738677932404,4.726714938440437
32361057556,4.464353916203227,-0.2505845758686721,-0.8363015104670249,1.3069910392643511,4.684458869131881
94493496784,4.100574057181549,-0.3137406661371072,-0.8491817453948692,1.364587100639533,4.302238746289105
79417999332,3.943237669337018,-0.2629902664753277,-0.8528190332895881,1.3670467043010446,4.194475073873147
96680767841,4.007365585028571,-0.2806292440529717,-0.8018286799806218,0.8931023376076115,3.818009998602589
