# Merchant Ranking Algorithm
The method for ranking each merchant takes inspiration from the approach proposed in (https://sapinsider.org/leveraging-analytical-method-for-ranking-suppliers/): we scale each sub-attribute with min-max normalisation, then sum the scaled sub-attributes to get the overall score for a particular metric.

The Key Attributes/Metrics are: 
- Financial
- Customer_Base 
- Sustainability

In [94]:
# All functions (probably already implemented from other processing)
from pyspark.sql import SparkSession, functions as F
from pyspark.ml import feature as H
from pyspark.sql.functions import coalesce
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql import Window
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [95]:
# This opens the spark session 
def open_spark():
    """Build (or reuse) the Spark session used by this notebook.

    The session is named "Data_Explorer" and is configured with eager
    evaluation in the REPL, parquet metadata caching, 6g of driver memory
    and Arrow-backed pandas conversion.

    Returns:
        The session returned by ``getOrCreate()``.
    """
    settings = (
        ("spark.sql.repl.eagerEval.enabled", True),
        ("spark.sql.parquet.cacheMetadata", "true"),
        ("spark.driver.memory", "6g"),
        ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    )
    builder = SparkSession.builder.appName("Data_Explorer")
    for option, value in settings:
        builder = builder.config(option, value)
    return builder.getOrCreate()

# Previously Unused Functions
For some reason these functions are not applied in the process of obtaining the processed/merchants.parquet file, so they need to be applied here to ensure this notebook and its functions work.

In [96]:
# This is used to standardise the description of each merchant
def text_process(text):
    """Standardise a free-text merchant description.

    Steps, in order: strip punctuation and digits, lower-case, drop English
    stop words, lemmatise each remaining word, and re-join with single spaces.

    Args:
        text: The raw description string.

    Returns:
        The cleaned description as a single space-separated string.
    """
    lemmatizer = WordNetLemmatizer()
    # Load the stop-word list once per call and use a set for O(1) lookups,
    # instead of re-reading the corpus for every word.
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    cleaned = ''.join(
        char for char in text if char not in punctuation and not char.isdigit()
    )
    # Lower-case BEFORE the stop-word test: the stop-word list is lower-case,
    # so testing the original casing would let words such as "The" through.
    words = [word for word in cleaned.lower().split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# this function standardises the tags attribute, creating a list with the 'description', 'revenue band' and 'BNPL service charge'
def tag_extract(tag_string):
    """Split a raw merchant tag string into its three standardised parts.

    The tag string is expected to hold three bracketed sections (round or
    square brackets): a description, a revenue band letter and a take rate.

    Args:
        tag_string: The raw ``tags`` value of one merchant.

    Returns:
        A list ``[description, band, take_rate]`` where ``description`` is the
        output of ``text_process``, ``band`` is the first lower-case letter of
        the second section and ``take_rate`` is a float parsed from the third.

    Raises:
        ValueError: If the string does not contain three sections, or the band
            letter or take rate cannot be found.
    """
    # Normalise square brackets to round ones so both tag styles split alike.
    normalised = tag_string.lower().replace('[', '(').replace(']', ')')
    sections = normalised.split('),')
    if len(sections) < 3:
        raise ValueError('expected three tag sections, got: %r' % (tag_string,))

    description = text_process(sections[0].strip('('))

    band_match = re.search(r'[a-z]', sections[1])
    if band_match is None:
        raise ValueError('no revenue band letter found in: %r' % (tag_string,))

    # Prefer a decimal number (the original pattern); fall back to a bare
    # integer so a take rate written without a fractional part still parses.
    rate_match = (re.search(r'[0-9]+\.[0-9]+', sections[2])
                  or re.search(r'[0-9]+', sections[2]))
    if rate_match is None:
        raise ValueError('no take rate found in: %r' % (tag_string,))

    return [description, band_match.group(), float(rate_match.group())]

# This takes the dataset, and assigns the tag an unique integer value
def description_convert(processed_tags):
    """Replace every tag with an integer key and persist the tag-to-key table.

    Keys are handed out in order of first appearance, starting at 0. The
    lookup table is written to ``../data/tables/description_lookup.csv`` with
    one ``tag, key`` line per distinct tag.

    Args:
        processed_tags: Iterable of hashable tags (one per merchant).

    Returns:
        A list with the integer key of each tag, in input order.
    """
    lookup_tags = {}
    for tag in processed_tags:
        # setdefault only stores the next free key when the tag is new
        lookup_tags.setdefault(tag, len(lookup_tags))

    # Persist the lookup table
    with open('../data/tables/description_lookup.csv', 'w') as f:
        f.writelines("%s, %s\n" % entry for entry in lookup_tags.items())

    return [lookup_tags[tag] for tag in processed_tags]

# This function takes the pandas dataframe containing merchant information and pre_processes it
def merchant_process(merchants, spark):
    """Expand the raw ``tags`` column of the merchant table into three columns.

    The merchants are pulled into pandas, each tag string is parsed with
    ``tag_extract``, the description is replaced by its integer key from
    ``description_convert`` (which also writes the lookup CSV), and the
    result is handed back to Spark without the original ``tags`` column.

    Args:
        merchants: Spark DataFrame holding a ``tags`` column.
        spark: Session used to build the returned DataFrame.

    Returns:
        A Spark DataFrame with the original columns (minus ``tags``) plus
        ``Description``, ``Earnings_Class`` and ``BNPL_Fee``.
    """
    merchants_pd = merchants.toPandas()
    # Parse every raw tag string into [description, band, take rate]
    parsed_rows = [tag_extract(raw_tag) for raw_tag in merchants_pd['tags']]
    tag_frame = pd.DataFrame(parsed_rows, columns=('Description', 'Earnings_Class', 'BNPL_Fee'))
    tag_frame['Description'] = description_convert(list(tag_frame['Description'].values))
    # Side-by-side concat relies on both frames sharing the same row index
    combined = pd.concat([merchants_pd, tag_frame], axis=1).drop(columns='tags')
    return spark.createDataFrame(combined)

# Everything new

In [97]:
# Define our scaling function
def feature_standardisation(dataset, max_columns, min_columns = False):
    """Min-max scale the given columns of a Spark DataFrame in place.

    Args:
        dataset: Spark DataFrame containing every listed column.
        max_columns: Columns where higher raw values are preferred; scaled as
            ``(value - low) / (high - low)``.
        min_columns: Columns where lower raw values are preferred; scaled as
            ``(high - value) / (high - low)``. Any falsy value (the default
            ``False``, ``None`` or an empty list) means "no such columns".

    Returns:
        The DataFrame with each listed column replaced by its scaled value,
        rounded to 4 decimal places. A column whose minimum equals its maximum
        (or that has no non-null values) has no defined scale; its non-null
        entries are set to 0.0 instead of dividing by zero.
    """
    def _scale(frame, col_name, higher_is_better):
        # One Spark action fetches both bounds (previously three per column).
        bounds = frame.agg(F.min(col_name).alias('low'), F.max(col_name).alias('high')).first()
        low, high = bounds['low'], bounds['high']
        if low is None or high is None or high == low:
            # Degenerate range: keep nulls as nulls, everything else scores 0.0.
            scaled = F.when(F.col(col_name).isNotNull(), F.lit(0.0))
        elif higher_is_better:
            scaled = F.round((F.col(col_name) - low) / (high - low), 4)
        else:
            scaled = F.round((high - F.col(col_name)) / (high - low), 4)
        return frame.withColumn(col_name, scaled)

    # if higher values are preferred
    for col_name in max_columns:
        dataset = _scale(dataset, col_name, True)
    # if lower values are preferred
    for col_name in (min_columns or []):
        dataset = _scale(dataset, col_name, False)
    return dataset

# 1. Financial Metric
For this we will look into:
- Total_Revenue
- Average_Revenue_Growth

These are both taken over the last 6 months

In [98]:
# This function calculates the revenue growth score
def revenue_growth_score(revenue_data, months=(3, 4, 5, 6, 7, 8, 9), year=2022):
    """Compute each merchant's average month-over-month revenue growth.

    For every month ``m`` in ``months`` the fraud-weighted revenue of all rows
    dated before the first day of month ``m + 1`` is summed per merchant. These
    sums are cumulative: the filter only has an upper bound, so each one covers
    everything in ``revenue_data`` up to that cutoff. Growth between two
    consecutive entries is ``(current - previous) / current`` and the result is
    the mean of those growth rates, rounded to 4 decimal places.

    Args:
        revenue_data: Spark DataFrame with ``merchant_abn``, ``order_datetime``
            and ``BNPL_weighted_Revenue`` columns.
        months: Calendar months (1-12) of ``year``, in chronological order.
            ``n`` months yield ``n - 1`` growth rates. Defaults to the
            previously hard-coded March to September.
        year: Calendar year the months belong to. Defaults to 2022.

    Returns:
        Spark DataFrame with ``merchant_abn`` and ``Revenue_Growth_Avg``.

    Raises:
        ValueError: If fewer than two months are given.
    """
    import datetime

    months = list(months)
    if len(months) < 2:
        raise ValueError('months must contain at least two entries to compute a growth rate')

    def _cutoff(month):
        # First day of the following month; December rolls into next January.
        return datetime.date(year + month // 12, month % 12 + 1, 1).isoformat()

    # One conditional aggregation replaces one aggregate-and-join per month.
    # A merchant without rows before a cutoff gets a null sum, exactly as the
    # former left joins produced.
    cumulative_totals = [
        F.sum(
            F.when(F.col('order_datetime') < F.lit(_cutoff(month)), F.col('BNPL_weighted_Revenue'))
        ).alias('Month_' + str(month))
        for month in months
    ]
    revenue_data_table = revenue_data.groupBy('merchant_abn').agg(*cumulative_totals)
    # Missing totals become 1 so the growth ratios below stay defined
    revenue_data_table = revenue_data_table.fillna(1)

    # Sum the growth rate of every consecutive pair of months
    growth_total = None
    for previous, current in zip(months, months[1:]):
        latest = F.col('Month_' + str(current))
        growth = (latest - F.col('Month_' + str(previous))) / latest
        growth_total = growth if growth_total is None else growth_total + growth

    revenue_data_table = revenue_data_table.withColumn(
        'Revenue_Growth_Avg', F.round(growth_total / (len(months) - 1), 4)
    )
    return revenue_data_table.select('merchant_abn', 'Revenue_Growth_Avg')

# This function scores each merchant financially (i.e. the Financial Score)
def finantial_score(RECENCY, transactions, fraud_data):
    """Compute, scale and save the financial metrics of every merchant.

    Transactions newer than ``RECENCY`` are joined with their fraud prediction
    and each transaction's ``BNPL_Revenue`` is discounted by 10% of that
    prediction. Per merchant, the total weighted revenue and the average
    revenue growth are then min-max scaled and written to
    ``../data/Normalised/revenue_scaled1`` (overwriting any previous output).

    Args:
        RECENCY: Lower bound (exclusive) compared against ``order_datetime``.
        transactions: Spark DataFrame with ``merchant_abn``, ``user_id``,
            ``order_datetime``, ``dollar_value`` and ``BNPL_Revenue``.
        fraud_data: Spark DataFrame keyed by ``merchant_abn``, ``user_id`` and
            ``order_datetime`` with a ``prediction`` column.

    Returns:
        None. The result is only written to disk.
    """
    revenue_data = (
        transactions.where(F.col('order_datetime') > RECENCY)
        .select('merchant_abn', 'user_id', 'order_datetime', 'dollar_value', 'BNPL_Revenue')
        # Left join keeps transactions that have no fraud prediction
        .join(fraud_data, on=['merchant_abn', 'user_id', 'order_datetime'], how='left')
    )
    # A transaction without a prediction is treated as prediction 0 (no
    # discount). Leaving it null would null the weighted revenue and silently
    # drop the transaction from the sums, defeating the left join.
    fraud_weight = 1 - 0.1 * F.coalesce(F.col('prediction'), F.lit(0.0))
    revenue_data = revenue_data.withColumn(
        'BNPL_weighted_Revenue', F.col('BNPL_Revenue') * fraud_weight
    )
    # Total weighted revenue per merchant
    revenue_final = revenue_data.groupBy('merchant_abn').agg(
        F.round(F.sum('BNPL_weighted_Revenue'), 2).alias('Total_Revenue')
    )
    # Average revenue growth per merchant
    revenue_final = revenue_final.join(
        revenue_growth_score(revenue_data), on='merchant_abn', how='left'
    )
    # Scale both metrics (higher is better) and save
    revenue_final = feature_standardisation(revenue_final, ['Revenue_Growth_Avg', 'Total_Revenue'])
    revenue_final.write.parquet('../data/Normalised/revenue_scaled1', mode ='overwrite')

# Sustainability 
- Experience
- Customer Growth (Last 6 months)
- Postcode_Entropy
- Industry score (added when scoring each metric [next notebook])

In [99]:
def customer_growth(full_dataset):
    """Compute each merchant's average monthly growth in unique customers.

    The baseline is the number of distinct customers seen before 2022-05-01.
    For May to October 2022 the distinct customers seen before the first day
    of the following month are counted (cumulative counts, since the filter
    only has an upper bound). Growth for a month is
    ``(current - previous) / current`` and the result is the mean of the six
    growth rates, rounded to 4 decimal places, with nulls replaced by 0.

    Only merchants with at least one transaction before 2022-05-01 appear in
    the result, because the baseline drives the left joins.

    Args:
        full_dataset: Spark DataFrame with ``merchant_abn``, ``user_id`` and
            ``order_datetime`` columns.

    Returns:
        Spark DataFrame with ``merchant_abn`` and ``Customer_Growth_Avg``.
    """
    order_date = F.col('order_datetime')
    # Baseline: unique customers before the first scored month
    Unique_cust = (
        full_dataset.where(order_date < F.lit('2022-05-01'))
        .groupBy('merchant_abn')
        .agg(F.countDistinct('user_id').alias('Unique_Customers'))
    )
    months = [5, 6, 7, 8, 9, 10]
    customer_counting_data = full_dataset.select('merchant_abn', 'user_id', 'order_datetime')
    for month in months:
        # Cutoff is the first day of the following month, zero-padded
        cutoff = F.lit('2022-%02d-01' % (month + 1))
        cumulative = (
            customer_counting_data.where(order_date < cutoff)
            .groupBy('merchant_abn')
            .agg(F.countDistinct('user_id').alias('Month_' + str(month)))
        )
        Unique_cust = Unique_cust.join(cumulative, on='merchant_abn', how='left')
    Unique_cust = Unique_cust.fillna(1)

    # Growth of every month relative to the count before it. The first month
    # is compared with the baseline using the same (current - previous) /
    # current formula as the others; it used to be baseline / current, which
    # scored a merchant with no new customers as maximal growth.
    growth_sum = None
    previous = 'Unique_Customers'
    for month in months:
        current = 'Month_' + str(month)
        growth = (F.col(current) - F.col(previous)) / F.col(current)
        growth_sum = growth if growth_sum is None else growth_sum + growth
        previous = current

    average = F.round(growth_sum / len(months), 4)
    Unique_cust = Unique_cust.withColumn('Customer_Growth_Avg', F.coalesce(average, F.lit(0.0)))
    return Unique_cust.select('merchant_abn', 'Customer_Growth_Avg')

def sustainability_score(merchants, full_dataset):
    """Compute, scale and save the sustainability metrics of every merchant.

    Combines the merchant's ``postcode_entropy``, its average customer growth
    and its experience (number of distinct ``order_datetime`` values it traded
    on), min-max scales them and writes the result to
    ``../data/Normalised/sustainability_scaled1`` (overwriting previous output).

    Args:
        merchants: Spark DataFrame with ``merchant_abn`` and
            ``postcode_entropy`` columns.
        full_dataset: Spark DataFrame of transactions with ``merchant_abn``,
            ``user_id`` and ``order_datetime`` columns.

    Returns:
        None. The result is only written to disk.
    """
    merchants_sub = merchants.select(['merchant_abn', 'postcode_entropy'])
    merchants_sub = merchants_sub.join(customer_growth(full_dataset), on='merchant_abn', how='left')
    # Experience: count the DISTINCT dates a merchant traded on. Summing the
    # per-date transaction counts, as before, yields the total number of
    # transactions rather than the number of business days.
    business_days = full_dataset.groupBy('merchant_abn').agg(
        F.countDistinct('order_datetime').alias('Total_Business_Days')
    )
    merchants_sub = merchants_sub.join(business_days, on='merchant_abn', how='left')
    # Higher experience and growth are better; postcode_entropy is scaled as lower-is-better
    merchants_final = feature_standardisation(merchants_sub, ['Total_Business_Days', 'Customer_Growth_Avg'], ['postcode_entropy'])
    # save file
    merchants_final.write.parquet('../data/Normalised/sustainability_scaled1', mode = 'overwrite')

# Customer Base
In this section we use the features:
- Transaction_per_User
- customer_wealth
- Unique_Customers
- Loyal_AVG
- postcode_entropy

In [100]:
def loyalty(full_dataset):
    """Compute the average customer loyalty score of every merchant.

    For each (user, merchant) pair:
      * ``RPR``       - number of transactions of the pair,
      * ``CLV``       - summed ``dollar_value`` of the pair,
      * ``upsell``    - number of distinct merchants the user bought from,
      * ``retention`` - mean number of days between consecutive orders
                        (365 when the pair has a single order).
    The pair's score is ``RPR * CLV * upsell / retention``. Undefined scores
    (e.g. a retention of 0 days) are replaced by the user's median score, and
    the scores are averaged per merchant.

    Args:
        full_dataset: Spark DataFrame with ``user_id``, ``merchant_abn``,
            ``order_datetime`` and ``dollar_value`` columns.

    Returns:
        Spark DataFrame with ``merchant_abn`` and ``Loyal_AVG`` (rounded to 2
        decimal places).
    """
    keys = ["user_id", "merchant_abn"]
    grouped = full_dataset.groupBy(*keys)
    RPR = grouped.count().withColumnRenamed("count", "RPR")
    upSell = RPR.groupBy("user_id").count().withColumnRenamed("count", "upsell")
    CLV = grouped.sum("dollar_value").withColumnRenamed("sum(dollar_value)", "CLV")

    # Days between consecutive orders of the same user at the same merchant
    w = Window.partitionBy(keys).orderBy("order_datetime")
    retention = (
        full_dataset.withColumn(
            'diff',
            F.datediff(F.col("order_datetime"), F.lag("order_datetime").over(w))
        )
        .groupBy(*keys)
        .agg(F.avg(F.col("diff")).alias("retention"))
    )

    loyal = (
        # Single-order pairs have no gap to average; treat them as 365 days
        retention.na.fill(365, subset=["retention"])
        .join(RPR, on=keys, how="left")
        .join(CLV, on=keys, how="left")
        .join(upSell, on=["user_id"], how="left")
    )
    # Cast only the score to float. Casting every column, as before, also
    # converted the user_id / merchant_abn keys to 32-bit floats, which cannot
    # represent large identifiers exactly, so distinct merchants could collapse
    # into one group and mis-join downstream.
    loyal = loyal.withColumn(
        "loyal",
        (F.col("RPR") * F.col("CLV") * F.col("upsell") / F.col("retention")).cast("float")
    )
    loyal = loyal.select('user_id', 'merchant_abn', 'loyal')

    # Replace undefined scores with the user's median score
    magic_percentile = F.expr('percentile_approx(loyal, 0.5)')
    user_median = loyal.groupBy('user_id').agg(magic_percentile.alias('med_val'))
    loyal = loyal.join(user_median, on='user_id', how='left')
    loyal = loyal.withColumn('loyal', F.coalesce('loyal', 'med_val')).select(['user_id', 'merchant_abn', 'loyal'])

    loyal_agg = loyal.groupBy('merchant_abn').agg(F.round(F.avg('loyal'), 2).alias('Loyal_AVG'))
    return loyal_agg

In [101]:
def customer_entropy(full_dataset):
    """Compute the postcode entropy of every merchant's customer base.

    Every merchant is paired with every postcode in the data. The count of a
    pair is the number of distinct users of that merchant in that postcode,
    and pairs that never occur are given a count of 1. Counts are turned into
    per-merchant probabilities and the Shannon entropy (base 2) is returned.

    Args:
        full_dataset: Spark DataFrame with ``merchant_abn``, ``user_id``,
            ``postcode`` and ``order_datetime`` columns.

    Returns:
        Spark DataFrame with ``merchant_abn`` and ``Customer_Entropy``.
    """
    base = full_dataset.select('merchant_abn', 'user_id', 'postcode', 'order_datetime')

    # Full merchant x postcode grid
    all_merchants = base.select('merchant_abn').toDF('merchant_abn').drop_duplicates()
    all_postcodes = base.select('postcode').toDF('postcode').drop_duplicates()
    grid = all_merchants.crossJoin(all_postcodes)

    # Observed distinct users per pair; unseen pairs are filled with 1
    observed = base.groupBy('merchant_abn', 'postcode').agg(
        F.countDistinct('user_id').alias('Count')
    )
    counts = grid.join(observed, on=['merchant_abn', 'postcode'], how='left').fillna(1)

    # Normalise the counts of each merchant into probabilities
    merchant_totals = counts.groupBy("merchant_abn").sum('Count')
    probabilities = counts.join(merchant_totals, on='merchant_abn', how='left').withColumn(
        'Probability', F.col('Count') / F.col('sum(Count)')
    )

    surprise = F.col("Probability") * F.log2(col("Probability"))
    return probabilities.groupBy("merchant_abn").agg((-F.sum(surprise)).alias("Customer_Entropy"))

In [102]:
def customer_score(full_dataset, customers):
    """Compute, scale and save the customer-base metrics of every merchant.

    Per merchant: number of unique customers, transactions per customer,
    average customer taxable income, average loyalty score and customer
    postcode entropy. The metrics are min-max scaled and written to
    ``../data/Normalised/customer_scaled1`` (overwriting previous output).

    Args:
        full_dataset: Spark DataFrame of transactions with a ``user_id`` column.
        customers: Spark DataFrame keyed by ``user_id`` that supplies the
            ``Average taxable income or loss`` column.

    Returns:
        None. The result is only written to disk.
    """
    enriched = full_dataset.join(customers, on='user_id')

    # Simple per-merchant aggregates
    aggregates = [
        F.countDistinct('user_id').alias('Unique_Customers'),
        F.round(F.count('user_id') / F.countDistinct('user_id'), 2).alias('Transaction_per_User'),
        F.round(F.avg('Average taxable income or loss'), 2).alias('customer_wealth'),
    ]
    cust_data = enriched.groupBy('merchant_abn').agg(*aggregates)

    # Loyalty is inner-joined, entropy is left-joined
    cust_data = cust_data.join(loyalty(enriched), on='merchant_abn')
    cust_data = cust_data.join(customer_entropy(enriched), on='merchant_abn', how='left')

    higher_is_better = ['Transaction_per_User', 'customer_wealth', 'Unique_Customers', 'Loyal_AVG']
    lower_is_better = ['Customer_Entropy']
    cust_final = feature_standardisation(cust_data, higher_is_better, lower_is_better)
    # save file
    cust_final.write.parquet('../data/Normalised/customer_scaled1', mode = 'overwrite')

# Industry/Environment 
We add a general score, based on their description tags, in relation to which industry they belong to, including:
- Market Dominance
- Survival Rate
- Fraud_Prob_Avg
- Customer_Base
- Growth

In [103]:
def description_transformation(spark):
    """Attach a sector to every entry of the description lookup table.

    Reads ``../data/tables/description_lookup.csv``, maps each description key
    to one of four hard-coded sectors, writes the key-to-sector table to
    ``../data/curated/sector_lookup_table.csv`` and returns the lookup table
    joined with its sector.

    Args:
        spark: Session used to read the CSV and build the sector DataFrame.

    Returns:
        Spark DataFrame with ``Full_description``, ``Description_Key`` and
        ``sector`` columns.
    """
    description_lookup = (
        spark.read.csv('../data/tables/description_lookup.csv')
        .withColumnRenamed('_c0', 'Full_description')
        .withColumnRenamed('_c1', 'Description_Key')
    )

    # Sector -> description keys. The keys stand for the descriptions listed
    # above each entry, in the same order.
    sector_keys = (
        # "equipment tool furniture appliance rent al leasing", "florist supply nursery stock flower",
        # "lawn garden supply outlet including nursery"
        ("Administrative_Support_Services", [8, 10, 23]),
        # "shoe shop", "gift card novelty souvenir shop", "antique shop sale repair restoration service",
        # "watch clock jewellery repair shop", "jewellery watch clock silverware shop",
        # "motor vehicle supply new part", "furniture home furnishing equipment shop manufa...",
        # "tent awning shop", "optician optical good eyeglass"
        ("Personal_Services", [19, 5, 11, 3, 2, 12, 0, 15, 20]),
        # "digital good book movie music", "music shop musical instrument piano sheet music",
        # "health beauty spa", "bicycle shop sale service", "art dealer gallery", "hobby toy game shop",
        # "stationery office supply printing writing paper"
        ("Arts_Recreation_Services", [18, 4, 21, 17, 16, 22, 14]),
        # "telecom", "computer programming data processing integrated...", "book periodical newspaper",
        # "artist supply craft shop", "computer computer peripheral equipment software",
        # "cable satellite pay television radio service"
        ("Information_Media_Telecommunications", [24, 6, 7, 9, 1, 13]),
    )
    # Flatten into two parallel lists: one sector name per key
    desc = [sector for sector, keys in sector_keys for _ in keys]
    tags = [key for _, keys in sector_keys for key in keys]

    to_df = {
        "sector": desc,
        "Description": tags,
    }
    sector_frame = pd.DataFrame(to_df)
    # write the key-to-sector table
    sector_frame.to_csv('../data/curated/sector_lookup_table.csv')
    sectors = spark.createDataFrame(sector_frame)

    matches_key = description_lookup.Description_Key == sectors.Description
    return description_lookup.join(sectors, on=matches_key).drop('Description')

In [104]:
def industry(full_dataset, fraud_probabilities, spark, RECENCY):
    """Compute, scale and save the industry (sector) metrics of every merchant.

    Each transaction's ``BNPL_Revenue`` is discounted by 10% of its fraud
    prediction and assigned to the sector of its merchant description. Per
    sector, over the transactions newer than ``RECENCY``: its share of total
    weighted revenue, its share of total transactions, its average weighted
    revenue, and the survival rate read from
    ``../data/tables/sector_information/``. The metrics are min-max scaled,
    attached to every merchant of the sector and written to
    ``../data/Normalised/industry_scaled1`` (overwriting previous output).

    Args:
        full_dataset: Spark DataFrame with ``merchant_abn``, ``user_id``,
            ``order_datetime``, ``dollar_value``, ``BNPL_Revenue`` and
            ``Description`` columns.
        fraud_probabilities: Spark DataFrame keyed by ``merchant_abn``,
            ``user_id`` and ``order_datetime`` with a ``prediction`` column.
        spark: Session used to read the lookup and sector tables.
        RECENCY: Lower bound (exclusive) compared against ``order_datetime``.

    Returns:
        None. The result is only written to disk.
    """
    industry_data = full_dataset.select('merchant_abn','user_id', 'order_datetime', 'dollar_value', 'BNPL_Revenue', 'Description')
    # Left join keeps transactions that have no fraud prediction
    industry_data = industry_data.join(fraud_probabilities, on=['merchant_abn', 'user_id', 'order_datetime'], how='left')
    # A missing prediction counts as 0 (no discount); a null would otherwise
    # null the weighted revenue and drop the transaction from every aggregate.
    fraud_weight = 1 - 0.1 * F.coalesce(F.col('prediction'), F.lit(0.0))
    industry_data = industry_data.withColumn('BNPL_weighted_Revenue', F.col('BNPL_Revenue') * fraud_weight)

    # Attach the sector of every description key
    dataset_sector = description_transformation(spark)
    industry_data = industry_data.join(
        dataset_sector,
        on=industry_data.Description == dataset_sector.Description_Key,
        how='left'
    ).drop('Full_description', 'Description_Key')
    Merchant_Industry = industry_data.select('merchant_abn', 'sector').distinct()

    # Market Dominance
    industry_table = industry_data.where(F.col('order_datetime') > RECENCY).groupBy('sector').agg(
        F.sum('BNPL_weighted_Revenue').alias('Total_Weighted_Revenue'),
        F.count('BNPL_weighted_Revenue').alias('Total_Transactions'),
        F.avg('BNPL_weighted_Revenue').alias('Average_Weighted_Revenue')
    )
    # Fetch both grand totals in one action and by name. Previously both were
    # read from position [0][0], so the transaction share was divided by the
    # revenue total.
    totals = industry_table.agg(
        F.sum('Total_Weighted_Revenue').alias('revenue'),
        F.sum('Total_Transactions').alias('transactions')
    ).first()
    total_rev, total_transa = totals['revenue'], totals['transactions']
    industry_table = industry_table.withColumn('Total_Weighted_Revenue', F.col('Total_Weighted_Revenue') / total_rev).withColumnRenamed('Total_Weighted_Revenue', 'Portion_of_Total_Revenue')
    industry_table = industry_table.withColumn('Total_Transactions', F.col('Total_Transactions') / total_transa).withColumnRenamed('Total_Transactions', 'Portion_of_Total_Transactions')

    # Survival Rate
    sector_info = spark.read.parquet('../data/tables/sector_information/')
    industry_table = industry_table.join(sector_info.select(F.col('sector'), F.col('survival_rate').cast('double').alias("survival_rate")), on='sector', how='left')
    industry_table = feature_standardisation(industry_table, ['Portion_of_Total_Revenue', 'Portion_of_Total_Transactions', 'survival_rate', 'Average_Weighted_Revenue'])
    # ensure the sector scores are assigned to each merchant
    industry_table = Merchant_Industry.join(industry_table, on='sector', how='left')
    # save file
    industry_table.write.parquet('../data/Normalised/industry_scaled1', mode = 'overwrite')

In [105]:
# This is the main function, which runs everything
def main():
    """Run the whole ranking pipeline; every score is saved to disk.

    Loads the processed customers, transactions and merchants plus the fraud
    model output, expands the merchant tags, derives the unweighted BNPL
    revenue of every transaction and then runs the financial, sustainability,
    customer and industry scoring functions in turn.
    """
    spark = open_spark()
    customers = spark.read.parquet('../data/processed/customers/')
    transactions = spark.read.parquet('../data/processed/transactions')
    merchants = spark.read.parquet('../data/processed/merchants/')
    fraud_probabilities = spark.read.parquet('../models/random_forest_output_full/')
    fraud_probabilities = fraud_probabilities.select('merchant_abn', 'user_id', 'order_datetime', 'prediction')

    # Apply the previously shared but unimplemented merchant pre-processing
    merchants = merchant_process(merchants, spark)
    full_dataset = transactions.join(merchants, on='merchant_abn', how='inner')
    # calculate the BNPL unweighted revenue
    full_dataset = full_dataset.withColumn('BNPL_Revenue', F.col('dollar_value') * F.col('BNPL_Fee'))

    # For now "recent" means the last 6 months (i.e. since March)
    RECENCY = F.lit('2022-03-01')
    # Now run each score, which will save results to file
    # Note: the customer score takes much longer and should probably be run separately
    # The financial score needs BNPL_Revenue, which is derived on full_dataset
    # above, so it is given full_dataset rather than the raw transactions.
    # NOTE(review): assumes the processed transactions do not already carry a
    # BNPL_Revenue column - confirm against the transaction pre-processing.
    finantial_score(RECENCY, full_dataset, fraud_probabilities)
    sustainability_score(merchants, full_dataset)
    customer_score(full_dataset, customers)
    industry(full_dataset, fraud_probabilities, spark, RECENCY)
    

In [106]:
# Only run the pipeline when executed directly (notebook cell or script),
# not when these functions are imported from another module.
if __name__ == "__main__":
    main()

                                                                                