In [1]:
import os
os.sys.path.append("../")
from scripts.ranking_model_v2 import *

2024-10-04 00:11:00.512414: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-04 00:11:00.564078: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-04 00:11:00.564941: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#  Create a Spark Session
spark = (
    SparkSession.builder.appName("Ranking Model")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

sc = SparkContext.getOrCreate()
sc.setLogLevel("ERROR")

24/10/04 00:11:04 WARN Utils: Your hostname, DESKTOP-H6V94HM resolves to a loopback address: 127.0.1.1; using 192.168.0.220 instead (on interface eth0)
24/10/04 00:11:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/04 00:11:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Ranking System

In [3]:
# Reading transactions
transactions = spark.read.parquet(f"../data/curated/transactions.parquet")
transactions = transactions.withColumns(
    {"period": F.date_format(F.col("order_datetime"), "yyyy-MM")})
transactions = transactions.drop("merchant_fp", "consumer_fp")

                                                                                

In [4]:
# Reading predicted fraud probabilities
merchant_fp = spark.read.parquet(f"../data/curated/predicted_merchant_fp.parquet")
consumer_fp = spark.read.parquet(f"../data/curated/predicted_consumer_fp.parquet")

# Join with transaction data
transactions_full = transactions.join(consumer_fp, on = ['consumer_id', 'order_datetime', 'order_id'], how = 'inner')
transactions_full = transactions_full.join(merchant_fp, on = ['merchant_abn', 'order_datetime', 'order_id'], how = 'inner')
# transactions_full.limit(5)

As there are many ways in ranking the top 100 merchants toi join the BNPL firm, we decided adopted the approach of an investor, consider each merchant as a "projects" and evaluating their value. To evluate the merchant's value, we will be using a modified version of **Discounted Cash Flow (DCF)** model where we substitute *cash flow* with *revenue*. The orignal DCF model has the below formula

$$ \text{DCF} = \sum^{n}_{t=1}\frac{CF_t}{(1+r)^t}$$

where $t$ is the time period of cash flow and $r$ is the discount rate.

The DCF model we based on is the one that use **Free Cash Flows (FCF)** from **Earning Before Interest and Tax (EBIT)**. EBIT are usually a percentage of sales revenue and in Corporate Financial Decision Making (FNCE20003), the formula for FCF is defined as

$$ \text{FCF} = \text{EBIT}(1-t) + \text{Depreciation} - \text{Capital Expenditure} - \Delta\text{Working Capital}$$

Since the BNPL charges merchant per transaction, this means that the firm is charging for a percentage of the sales revenue. Thus, the merchant doesn't pay the BNPL firm through EBIT or their FCF. This allows us to safely consider the percentage of revenue of the merchant as cash flows for the BNPL firm.

We will calculate of the project's  DCF using revenues from September 2022, October 2022, and November 2022. The value of the DCF is then multipled by the take rate, which we will called **Expected Project Value (EPV)**. After that, we will assign weights or penalties to the DCF and pick the merchants with the highest DCF.

In [5]:
# Aggregate the data into desried format
agg_transactions = transactions_full.groupBy("merchant_abn", "period", "take_rate").agg(
    F.count(F.col("order_id")).alias("num_orders"),
    F.round(F.sum("dollar_value"),2).alias('revenue'),
    F.round(F.mean(F.col("dollar_value")), 2).alias("revenue_per_order"),
    F.round(F.mean(F.col("merchant_fp")), 2).alias("avg_merchant_fp"),
    F.round(F.mean(F.col("consumer_fp")), 2).alias("avg_consumer_fp")
)

agg_transactions = agg_transactions.orderBy(["merchant_abn", "period"], ascending = [False, True])

# Partition the data based on merchant ABN and comput lag variables for each specific partition
merchant_partition = Window.partitionBy("merchant_abn").orderBy("period")
agg_transactions = agg_transactions.withColumns({
    "revenue_lag_1": F.lag("revenue", 1, None).over(merchant_partition),
    "revenue_lag_2": F.lag("revenue", 2, None).over(merchant_partition),
    "revenue_lag_3": F.lag("revenue", 3, None).over(merchant_partition),
    "num_order_lag_1": F.lag("num_orders", 1, None).over(merchant_partition),
    "num_order_lag_2": F.lag("num_orders", 2, None).over(merchant_partition),
    "expected_profit": F.round(F.col("revenue") * F.col("take_rate")/100,2)
    })
agg_transactions = agg_transactions.withColumns({
    "revenue_growth":
        F.when(F.col("revenue_lag_1").isNotNull(), F.round((F.col("revenue") - F.col("revenue_lag_1"))/F.col("revenue_lag_1"), 2)).otherwise(None),
    "revenue_growth_lag_1": F.when(F.col("revenue_lag_2").isNotNull(), 
        F.round((F.col("revenue_lag_1") - F.col("revenue_lag_2"))/F.col("revenue_lag_2"), 2)).otherwise(None),
    "revenue_growth_lag_2": F.when(F.col("revenue_lag_3").isNotNull(), 
        F.round((F.col("revenue_lag_2") - F.col("revenue_lag_3"))/F.col("revenue_lag_3"), 2)).otherwise(None)})

Comparing different merchants' values using the DCF model is only reliable when all merchants have revenue in the timeline of interest i.e from May 2021 to August 2022, all merchants must have sales in each month. We noticed that there are some merchants with no revenue in some particular month. Thus, we decide to asume that the missing revenue in some moths of some merchants are totally due to the merchant's inability to make any sales and not because of data entry errors. We adopted the perspective that we want merchants that have consistent sales. We will only consider merchants with 15 months of revenue records, prior to the date of the last transaction entry.

In [6]:
# Create a dataframe that store the valid date range of the data
months = pd.date_range(start="2021-06-01", end="2022-08-31", freq='MS').strftime('%Y-%m').tolist()
months_df = spark.createDataFrame([(month,) for month in months], ["period"])

# Join with the transactions dataframe
transactions_correct_months = transactions.join(months_df, on= ['period'], how = 'right')

# Get the list of merchants with complete sales records
complete_merchants = transactions_correct_months.groupBy("merchant_abn").agg(F.countDistinct("period")\
                                                                             .alias("month_count")).filter(F.col("month_count") == len(months)).select("merchant_abn")
print(f"Number of merchants with complete sales records: {complete_merchants.count()}")



Number of merchants with complete sales records: 3212


                                                                                

In [7]:
# # Select the merchants with complete sales records from the aggregated sales 
# print(f"Number of entries before removing merchants with missing sales records: {agg_transactions.count()}")
# agg_transactions = agg_transactions.join(complete_merchants, on='merchant_abn', how='inner')
# agg_transactions = agg_transactions.filter("2021-06" <= F.col("period")).orderBy(['merchant_abn', 'period'],
#                                                                                       ascending = [True, True])
# print(f"Number of entries after removing merchants with missing sales records: {agg_transactions.count()}")

# # Export the aggregated transactions to reduce the need to run this block again
# agg_transactions.write.parquet(f"../data/curated/agg_transactions.parquet", mode = 'overwrite')

In [8]:
agg_transactions = spark.read.parquet(f"../data/curated/agg_transactions.parquet")

In [9]:
agg_transactions_sub = agg_transactions.select(*['merchant_abn', 'period', 'revenue', 'revenue_lag_1', 'revenue_lag_2', 'revenue_lag_3',
'revenue_growth_lag_1', 'revenue_growth_lag_2'])


Since the last month of sales record is August 2022, this means that we have to forecast the revenues of the next 3 months. We will try 2 different approaches for forecasting revenue of 3 periods into the future. The first approach is to use a simple LSTM to predict the revenue. Contrast to the machine learning nature of approach 1, the second approach will be simply to compute the average monthly revenue growth rate of the 15 months period and then assume that revenues after August 2022 will grow by the same amount.

In [10]:
for merchant in complete_merchants.rdd.flatMap(lambda x:x).collect():
    # Predict sales revenue 3 period for all valid merchants
    partition = agg_transactions_sub.filter(F.col("merchant_abn") == merchant)
    
    forecasted_revenue = forecast_revenue(partition)

    print(forecasted_revenue)
    break

                                                                                

Initial input values: [[[ 6.81565700e+04]
  [ 8.45487200e+04]
  [ 7.96017700e+04]
  [-1.93878157e-01]
  [ 6.00000000e-02]]]
[70386.86, 64892.07, 60938.344]


If we run the above code multiple times, we will see that the forecasted revenues will change with each rerun. This is because the weights between the units are randomly intialise each time and Stochastic Gradient Descent isn't guaranteed to always find the local maximum and minimum. Thus, in order to train a neural network that minimise the mean squared error, we would have to fine-tune our model. The process would be feasible if we're doing for 8-10 merchants. However, our data contains more than 3000 merchants which would be computationally expensive. Thus, we will stick with the second approach that was mentioned previously.

Let's aggregate our data and find the monthly average growth for each merchant. From there, we can compute the DCF for each of them

In [11]:
agg_transactions.limit(1)

merchant_abn,period,take_rate,num_orders,revenue,revenue_per_order,avg_merchant_fp,avg_consumer_fp,revenue_lag_1,revenue_lag_2,revenue_lag_3,num_order_lag_1,num_order_lag_2,expected_profit,revenue_growth,revenue_growth_lag_1,revenue_growth_lag_2
63937753588,2021-10,4.17,14,42480.29,3034.31,41.25,16.06,20691.97,41997.37,36180.46,8,12,1771.43,1.05,-0.51,0.16


In [12]:
merchant_avg_growth = agg_transactions.groupBy("merchant_abn", ).agg(
    F.mean(F.col('revenue_growth')).alias('avg_monthly_revenue_growth'),
    F.mean(F.col("num_orders")).alias("avg_num_orders"),
    F.mean(F.col("revenue_per_order")).alias('avg_revenue_per_order'),
    (F.stddev(F.col('revenue'))/F.mean(F.col('revenue'))).alias("coef_of_variation"),
    F.stddev(F.col('revenue_growth')).alias("std_reveune_growth")
)
merchant_latest_revenue = agg_transactions.filter(F.col("period") == '2022-08').select("merchant_abn", "revenue", "take_rate")
merchant_latest_revenue = merchant_latest_revenue.join(merchant_avg_growth, on ='merchant_abn', how = 'inner')
merchant_latest_revenue = merchant_latest_revenue.withColumns({
    "forecasted_revenue_1": F.col("revenue") * (1 + F.col("avg_monthly_revenue_growth")),
    "forecasted_revenue_2": F.col("revenue") * (1 + F.col("avg_monthly_revenue_growth"))**2,
    "forecasted_revenue_3": F.col("revenue") * (1 + F.col("avg_monthly_revenue_growth"))**3,
})

According to the Victoria State Government [website](https://djsir.vic.gov.au/about-us/overview/the-economic-assessment-information-portal/i-am-looking-for-guidance-on-particular-economic-assessment-processes,-methods-and-variables#:~:text=Department%20of%20Treasury%20and%20Finance,on%20the%20category%20of%20investment.), there is no single discount rate. As a general guideline, the discount rate is between 4% and 7%. We acknowledge that each merchant may have their own discount rates but it's quite extensive to give each and everyone of them an individual discount rate. Thus, we will use the mid point of the recommended range and apply it to all merchants.

In [13]:
discount_rate = 1.055

merchant_latest_revenue = merchant_latest_revenue.withColumns(
    {"discounted_revenue_flow": F.col("forecasted_revenue_1")/discount_rate + F.col("forecasted_revenue_2")/discount_rate**2 + F.col("forecasted_revenue_3")/discount_rate**3,
     "expected_project_value": F.col("discounted_revenue_flow") * F.col('take_rate')/100}
    )
merchant_latest_revenue = merchant_latest_revenue.drop("forecasted_revenue_1", "forecasted_revenue_2", "forecasted_revenue_3", "revenue", "take_rate")
merchant_latest_revenue.filter(F.col("merchant_abn") == 70344541271)

merchant_abn,avg_monthly_revenue_growth,avg_num_orders,avg_revenue_per_order,coef_of_variation,std_reveune_growth,discounted_revenue_flow,expected_project_value
70344541271,51.08466666666667,4.866666666666666,1081.5886666666663,0.5737462225685777,195.37035923989032,722966459.4670486,41281384.83556847


We will also be using the predicted fraud probability for both merchants and consumer as part of our ranking system. We will assign our weights of choice, $\alpha$ and $\beta$ to the merchants and consumers' fraud probability, respectively, and sum them. We think that the merchant's fraud probability is more important to the BNPL firm as a merchant with a higher fraud probability with likely to commit scams, thus damaging the BNPL firm's reputation and customer will not shop from the firm anymore. Whereas for customer, it's easier to assess the risk of a customer not paying for the items. Thus, we decided the weight is going to be $\alpha = 0.65$ and $\beta = 0.35$. The formula for the combined fraud probability is:

$$\text{Combined Fraud Probability (CBF)} = \alpha \times \text{Merchant's FP} + \beta\times\text{Consumer's FP}$$

We will use the combined fraud probability to get the fraud-adjusted DCF

$$ \text{Fraud-adjusted EPV} = (1 - \text{CBF}) \times \text{EPV}

In [14]:
avg_fp = agg_transactions.withColumns({
    "total_merchant_fp": F.col("avg_merchant_fp") * F.col("num_orders"),
    "total_consumer_fp": F.col("avg_consumer_fp") * F.col("num_orders"),
}).select("merchant_abn", "period", "num_orders", "total_merchant_fp", "total_consumer_fp")

avg_fp = avg_fp.groupBy("merchant_abn").agg(
    (F.sum(F.col("total_merchant_fp"))/F.sum(F.col("num_orders"))).alias("avg_merchant_fp"),
    (F.sum(F.col("total_consumer_fp"))/F.sum(F.col("num_orders"))).alias("avg_consumer_fp")
)

In [15]:
alpha = 0.65
beta = 0.35

avg_fp = avg_fp.withColumn("combined_fp", alpha * F.col("avg_merchant_fp") + beta * F.col('avg_consumer_fp'))
merchant_ranking_metrics = merchant_latest_revenue.join(avg_fp, on='merchant_abn', how='inner')
merchant_ranking_metrics = merchant_ranking_metrics.drop("avg_merchant_fp","avg_consumer_fp")

In [16]:
merchant_ranking_metrics = merchant_ranking_metrics.withColumn(
    "risk_adjusted_epv",
    F.col("expected_project_value") * (1 - F.col("combined_fp")/100)
)

In [17]:
merchant_ranking_metrics.orderBy("std_reveune_growth", ascending = False).limit(10)

                                                                                

merchant_abn,avg_monthly_revenue_growth,avg_num_orders,avg_revenue_per_order,coef_of_variation,std_reveune_growth,discounted_revenue_flow,expected_project_value,combined_fp,risk_adjusted_epv
70344541271,51.08466666666667,4.866666666666666,1081.5886666666663,0.5737462225685777,195.37035923989032,722966459.4670486,41281384.83556847,42.42405479452055,23768147.512990005
71616292306,27.928666666666665,2.6666666666666665,210.442,0.730322688378949,106.90505098762485,5417787.616379169,177161.6550555988,39.5622125,107072.5846139858
95276443363,15.658666666666663,3.6,296.6186666666666,0.6820969976318357,51.655349033856346,6601250.027970397,251507.6260656721,39.01125925925927,153391.33400438444
75547072158,12.812666666666669,3.8666666666666663,284.17533333333336,0.7476836101265073,48.9629811281174,253224.4641329908,11445.745778811182,39.041500000000006,6977.154940576614
12171241826,10.885333333333334,8.2,228.37266666666665,0.568344026863831,40.80740511697543,5073911.329712124,209552.5379171107,38.0224756097561,129875.47529795238
33223110337,10.853333333333332,2.8,214.402,0.909280805105494,38.67401336199554,463179.7602395222,21908.402659329404,38.604011904761904,13450.880288578708
70783350473,10.031333333333333,3.066666666666667,576.6173333333334,0.5781118668358438,36.41120272401783,2314209.098268511,42812.86831796745,38.95503260869566,26135.101503985294
52129470223,9.220000000000002,4.866666666666666,309.45399999999995,0.6017263523053834,33.47001707285407,3312653.5794925285,109317.56812325344,39.02289726027398,66658.68582708623
33790986203,7.944,5.2,327.7420000000001,0.6609718021645656,27.62230615178351,2327647.7691004816,140589.9252536691,41.86257051282051,81735.3686604302
51420872378,8.786666666666665,2.8,258.158,0.9624601796783046,23.669936829738464,30096.247715517915,990.1665498405392,38.90971428571428,604.8955743448707


From the table above, we can see that the merchant 1 has a risk-adjusted EPV of approximately $23 millions. This is due to their average monthly revenue growth rate is 5108% which is a massive growth. This is due to fact that this merchant has highly fluctuating monthly revenue growth rate which may affect the average monthly growth rate. Thus, it's important that we penalised merchant with high revenue growth standard deviation. We will use a modified Winsorizor method to remove any standard deviation of revenue growth that are outliers.

This is the intial approach for removing  merchant with unstable revenue growth rate. Double click the cell to view
<!-- The formula for calculating the weight is

$$W_{\text{Revenue growth}} = \frac{1}{\log (1+s^{\text{r.g}}_i)} - \frac{1}{2}$$

where $s^{\text{rev growth}}_i$ is the standard deviation of the merchant's revenue growth rate accross the 15 months period. -->

In [18]:
lower_bound = merchant_ranking_metrics.select(F.percentile(F.col("std_reveune_growth"), 0.01)).first()[0]
upper_bound = merchant_ranking_metrics.select(F.percentile(F.col("std_reveune_growth"), 0.99)).first()[0]

merchant_ranking_metrics = merchant_ranking_metrics.filter((lower_bound <= F.col("std_reveune_growth")) & (F.col("std_reveune_growth") <= upper_bound))

                                                                                

In [19]:
merchant_ranking_metrics.orderBy("risk_adjusted_epv", ascending = False).limit(10)

                                                                                

merchant_abn,avg_monthly_revenue_growth,avg_num_orders,avg_revenue_per_order,coef_of_variation,std_reveune_growth,discounted_revenue_flow,expected_project_value,combined_fp,risk_adjusted_epv
97884414539,1.6406666666666667,3.0,9981.536666666663,0.7347099651497621,4.043104300033871,1895396.73499406,129266.05732659488,44.04951111111112,72324.99104162121
48534649627,0.0253333333333333,3335.4666666666667,143.09733333333332,0.178519044395029,0.1612392484418581,1400750.4061942906,93009.82697130088,30.67343908898305,64480.51434849038
86578477987,0.026,13805.266666666666,35.028,0.1718032053660295,0.1635236985883086,1426866.798267028,91747.5351285699,30.291424461195973,63955.89983008991
32361057556,0.0253333333333333,4341.933333333333,109.82733333333334,0.1775619949607971,0.1705816380449409,1380419.1374607098,91245.70498615292,30.4234067389335,63485.65302640829
45629217853,0.0273333333333333,11255.333333333334,37.83866666666667,0.1871713589260297,0.1782641165184267,1265784.8255556792,88351.78082378641,30.81994237694723,61121.81288488873
79827781481,0.0286666666666666,241.2,2036.1873333333335,0.1758887478875903,0.1881438153065235,1417110.0615200887,96646.90619567005,38.2490462962963,59680.386300950144
21439773999,0.0246666666666666,6118.266666666666,78.28,0.1751334555808915,0.1642240049734624,1387461.9684095625,84635.1800729833,30.24625520844684,59036.20751198023
96680767841,0.0293333333333333,1582.3333333333333,315.58133333333336,0.1762583893366516,0.1899799488166605,1468326.4819790302,86778.09508496069,32.35259734569201,58703.12739786161
38700038932,0.0226666666666666,365.26666666666665,1337.1320000000003,0.1785537549379326,0.1787842541271728,1418536.365320908,89509.64465174929,34.81062803431283,58350.77519719366
64403598239,0.0253333333333333,5752.2,78.12666666666668,0.1718802095337848,0.1548209411207793,1323501.879621348,83512.96860410707,30.271728173568377,58232.14975859427


There are also some merchant with a very few amount of orders per month. From the BNPL perspective, we would want merchants with a decent amount of order volume per month as more volume would likely result in more revenue for BNPL firm. We also found that a low average monthly order volume also lead to unstable growth rate, thus, led to unrealistic forecasted revenues. Thus, we need to create a weight that penalises merchant with low order volume. The weight is compute using a Sigmoid function

$$ W_{\text{num orders}} = \frac{1}{1 + e^{-(\bar{x_i} - \bar{x_{.}})}}$$

where $\bar{x_i}$ is the average number of order of merchant $i$ and $\bar{x_.}$ is the average number of order of all merchants.

In [20]:
# Compute average monthly order volume of all merchants
mean_num_orders = agg_transactions.agg(F.mean("num_orders")).collect()[0][0]
mean_num_orders

212.6747613117476

In [21]:
merchant_ranking_metrics = merchant_ranking_metrics.withColumn("risk_adjusted_epv", F.col("risk_adjusted_epv") * (1/(1 + F.exp(-F.col("avg_num_orders"))*np.exp(mean_num_orders))))
merchant_ranking_metrics.orderBy("risk_adjusted_epv", ascending = False).limit(10)

                                                                                

merchant_abn,avg_monthly_revenue_growth,avg_num_orders,avg_revenue_per_order,coef_of_variation,std_reveune_growth,discounted_revenue_flow,expected_project_value,combined_fp,risk_adjusted_epv
48534649627,0.0253333333333333,3335.4666666666667,143.09733333333332,0.178519044395029,0.1612392484418581,1400750.4061942906,93009.82697130088,30.67343908898305,64480.51434849038
86578477987,0.026,13805.266666666666,35.028,0.1718032053660295,0.1635236985883086,1426866.798267028,91747.5351285699,30.291424461195973,63955.89983008991
32361057556,0.0253333333333333,4341.933333333333,109.82733333333334,0.1775619949607971,0.1705816380449409,1380419.1374607098,91245.70498615292,30.4234067389335,63485.65302640829
45629217853,0.0273333333333333,11255.333333333334,37.83866666666667,0.1871713589260297,0.1782641165184267,1265784.8255556792,88351.78082378641,30.81994237694723,61121.81288488873
79827781481,0.0286666666666666,241.2,2036.1873333333335,0.1758887478875903,0.1881438153065235,1417110.0615200887,96646.90619567005,38.2490462962963,59680.38630092573
21439773999,0.0246666666666666,6118.266666666666,78.28,0.1751334555808915,0.1642240049734624,1387461.9684095625,84635.1800729833,30.24625520844684,59036.20751198023
96680767841,0.0293333333333333,1582.3333333333333,315.58133333333336,0.1762583893366516,0.1899799488166605,1468326.4819790302,86778.09508496069,32.35259734569201,58703.12739786161
38700038932,0.0226666666666666,365.26666666666665,1337.1320000000003,0.1785537549379326,0.1787842541271728,1418536.365320908,89509.64465174929,34.81062803431283,58350.77519719366
64403598239,0.0253333333333333,5752.2,78.12666666666668,0.1718802095337848,0.1548209411207793,1323501.879621348,83512.96860410707,30.271728173568377,58232.14975859427
89726005175,0.026,10937.533333333333,41.28066666666666,0.1778540036398852,0.1696551123392901,1346359.9274887978,80916.23164207675,30.303525237256423,56395.76096538315


The **coefficient of variation** is a ratio between the standard deviation and the mean, measuring the relative stability which help us compare merchants with different average revenue. Thus, we will create a weight that favors merchant with higher stability. The weight is calculate as

$$W_{\text{CV}} = \frac{1}{1 + CV}$$

In [28]:
merchant_ranking_metrics = merchant_ranking_metrics.withColumn("risk_adjusted_epv", F.col("risk_adjusted_epv") * (1/(1 + F.col("coef_of_variation") )))
merchant_ranking_metrics.orderBy("risk_adjusted_epv", ascending = False).limit(10)

                                                                                

merchant_abn,risk_adjusted_epv
86578477987,46576.97189271249
48534649627,46425.3603313313
32361057556,45783.39878360842
45629217853,43368.00117279411
79827781481,43161.76107724143
21439773999,42750.78259761917
96680767841,42428.313675071
64403598239,42402.98040334261
38700038932,42009.531508133165
89726005175,40650.27950950668


Now that we have found the adjusted EPV that accounts for different factor, we can now merge the merchants to their respective segments and will select top 20 merchants from each segnment.

In [29]:
# Reading in the segmented merchants
segments = spark.read.parquet(f"../data/curated/segmented_merchants_info.parquet/")
segments = segments.select("name", "merchant_abn", "segments")

complete_ranking = merchant_ranking_metrics.select("merchant_abn", "risk_adjusted_epv")

# Merge ranking with segments
complete_ranking = complete_ranking.join(segments, on = 'merchant_abn', how='inner')
complete_ranking

                                                                                

merchant_abn,risk_adjusted_epv,name,segments
73256306726,4253.383465153581,Id LLP,"Fashion, Personal..."
73841664453,3.971641122407166...,Lacinia At LLP,"Books, Media, Art..."
83412691377,1207.9882262976503,Suspendisse Sagit...,"Fashion, Personal..."
92202115241,9.960598276188678...,Fames Ac Turpis L...,"Books, Media, Art..."
96946925998,6.694706058513072...,Nisi Cum Corporation,"Books, Media, Art..."
64185141673,1.463940815494221...,Maecenas Corp.,"Books, Media, Art..."
66610548417,1.926901616965567...,Nulla Inc.,"Vehicles, Repair..."
71002398501,2.333617783831284...,Ipsum Phasellus C...,"Fashion, Personal..."
72762528640,1.260831198080646...,Sit Amet Risus As...,"Vehicles, Repair..."
87211363921,7.773066636813608...,Mauris Non PC,"Books, Media, Art..."


In [30]:
segments = ["Computers, Electronics, and Office Supplies", "Home, Garden, and Furnishings", "Books, Media, Arts, Crafts, and Hobbies", "Fashion, Personal Accessories, Health, and Beauty"
            , " Vehicles, Repairs, and Miscellaneous Services"]

In [40]:
seg1_ranking = complete_ranking.filter(F.col('segments') == segments[0])
seg1_ranking = seg1_ranking.orderBy("risk_adjusted_epv", ascending=False)
seg1_ranking.write.parquet(f"../data/curated/seg1_ranking.parquet", mode='overwrite')
seg1_ranking.limit(20)

                                                                                

merchant_abn,risk_adjusted_epv,name,segments
21439773999,42750.78259761917,Mauris Non Institute,"Computers, Electr..."
82368304209,40328.12623910149,Nec Incorporated,"Computers, Electr..."
35909341340,34528.05883045934,Arcu Sed Eu Incor...,"Computers, Electr..."
45433476494,30376.551287739952,Adipiscing Elit F...,"Computers, Electr..."
58454491168,29997.410374360927,Diam At Foundation,"Computers, Electr..."
94690988633,29332.853997916067,Eu Placerat LLC,"Computers, Electr..."
67400260923,24354.51311460297,Eleifend PC,"Computers, Electr..."
80518954462,23902.54171838429,Neque Sed Dictum ...,"Computers, Electr..."
34096466752,22582.129769355102,Nullam Enim Ltd,"Computers, Electr..."
57757792876,22232.71177159719,Pretium Et LLC,"Computers, Electr..."


In [41]:
seg2_ranking = complete_ranking.filter(F.col('segments') == segments[1])
seg2_ranking = seg2_ranking.orderBy("risk_adjusted_epv", ascending=False)
seg2_ranking.write.parquet(f"../data/curated/seg2_ranking.parquet", mode='overwrite')
seg2_ranking.limit(20)

                                                                                

merchant_abn,risk_adjusted_epv,name,segments
79827781481,43161.76107724143,Amet Risus Inc.,"Home, Garden, and..."
76767266140,33825.78192692432,Phasellus At Limited,"Home, Garden, and..."
43186523025,31371.03747319446,Lorem Ipsum Sodal...,"Home, Garden, and..."
49212265466,27448.785192182742,Auctor Company,"Home, Garden, and..."
21772962346,24870.696995250284,Purus Gravida Sag...,"Home, Garden, and..."
38090089066,23022.567284653695,Interdum Feugiat ...,"Home, Garden, and..."
42355028515,22642.576703767238,Eu Inc.,"Home, Garden, and..."
76314317957,22194.568063323582,Semper Corp.,"Home, Garden, and..."
24852446429,19697.71110190194,Erat Vitae LLP,"Home, Garden, and..."
90543168331,17474.217277567357,Phasellus Dapibus...,"Home, Garden, and..."


In [42]:
seg3_ranking = complete_ranking.filter(F.col('segments') == segments[2])
seg3_ranking = seg3_ranking.orderBy("risk_adjusted_epv", ascending=False)
seg3_ranking.write.parquet(f"../data/curated/seg3_ranking.parquet", mode='overwrite')
seg3_ranking.limit(20)

                                                                                

merchant_abn,risk_adjusted_epv,name,segments
32361057556,45783.39878360842,Orci In Consequat...,"Books, Media, Art..."
45629217853,43368.00117279411,Lacus Consulting,"Books, Media, Art..."
64403598239,42402.98040334261,Lobortis Ultrices...,"Books, Media, Art..."
94493496784,38198.29662671848,Dictum Phasellus ...,"Books, Media, Art..."
63123845164,38078.78242188081,Odio Phasellus In...,"Books, Media, Art..."
72472909171,34449.29025506108,Nullam Consulting,"Books, Media, Art..."
40515428545,34194.52231224272,Elit Sed Consequa...,"Books, Media, Art..."
79417999332,34145.09500900829,Phasellus At Company,"Books, Media, Art..."
27326652377,33018.039561383506,Tellus Aenean Cor...,"Books, Media, Art..."
98973094975,30865.91747188633,Ornare Fusce Inc.,"Books, Media, Art..."


In [43]:
seg4_ranking = complete_ranking.filter(F.col('segments') == segments[3])
seg4_ranking = seg4_ranking.orderBy("risk_adjusted_epv", ascending=False)
seg4_ranking.write.parquet(f"../data/curated/seg4_ranking.parquet", mode='overwrite')
seg4_ranking.limit(20)

                                                                                

merchant_abn,risk_adjusted_epv,name,segments
86578477987,46576.97189271249,Leo In Consulting,"Fashion, Personal..."
48534649627,46425.3603313313,Dignissim Maecena...,"Fashion, Personal..."
49322182190,38283.06229337885,Gravida Mauris In...,"Fashion, Personal..."
93558142492,23960.488332339733,Dolor Quisque Inc.,"Fashion, Personal..."
11439466003,20536.185189376087,Blandit At LLC,"Fashion, Personal..."
95574756848,18355.45043030973,At Pede Inc.,"Fashion, Personal..."
99976658299,16365.648665088898,Sociosqu Corp.,"Fashion, Personal..."
62224020443,16145.616793403386,Hendrerit A Corpo...,"Fashion, Personal..."
46804135891,16081.874104360708,Suspendisse Dui C...,"Fashion, Personal..."
81761494572,15028.519980407316,Nulla Facilisis I...,"Fashion, Personal..."


In [44]:
seg5_ranking = complete_ranking.filter(F.col('segments') == segments[4])
seg5_ranking = seg5_ranking.orderBy("risk_adjusted_epv", ascending=False)
seg5_ranking.write.parquet(f"../data/curated/seg5_ranking.parquet", mode='overwrite')
seg5_ranking.limit(20)

                                                                                

merchant_abn,risk_adjusted_epv,name,segments
96680767841,42428.313675071,Ornare Limited,"Vehicles, Repair..."
38700038932,42009.531508133165,Etiam Bibendum In...,"Vehicles, Repair..."
89726005175,40650.27950950668,Est Nunc Consulting,"Vehicles, Repair..."
80551528183,32810.426156641195,Ac Ipsum LLC,"Vehicles, Repair..."
49891706470,31071.28807610402,Non Vestibulum In...,"Vehicles, Repair..."
90568944804,30239.13623574025,Diam Eu Dolor LLC,"Vehicles, Repair..."
13514558491,30086.23184009224,Magna Praesent PC,"Vehicles, Repair..."
75454398468,27520.90543913781,Tempus Non Lacini...,"Vehicles, Repair..."
68559320474,25433.71596322629,Aliquam Auctor As...,"Vehicles, Repair..."
49549583265,24951.07755094422,Luctus Et Incorpo...,"Vehicles, Repair..."
