In [2]:
from pyspark.sql import functions as F, SparkSession, Window
import pandas as pd

In [3]:
#  Create a Spark Session
spark = (
    SparkSession.builder.appName("Ranking Model")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

24/10/01 13:59:42 WARN Utils: Your hostname, Alistairs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.13.39.131 instead (on interface en0)
24/10/01 13:59:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/01 13:59:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Feature engineering

In [4]:
transactions = spark.read.parquet(f"../data/curated/transactions.parquet")
transactions = transactions.withColumn("order_month", F.date_format(F.col("order_datetime"), "yyyy-MM"))

                                                                                

In [5]:
transactions.limit(1)

                                                                                

merchant_abn,dollar_value,order_id,order_datetime,consumer_id,order_month
79417999332,136.06570809815838,23acbb7b-cf98-458...,2021-11-26,1059280,2021-11


We think that the forecasted revenue of each month for each merchant will be useful in determining if we should onboard the merchant to the BNPL firm as it's an indication of future growth. The growth in number of orders each month might also be important.

In [None]:
""""
# Aggregate the data into desired format
agg_transactions = transactions.groupBy("merchant_abn", "order_month", "take_rate").agg(
    F.count(F.col("order_id")).alias("num_orders"),
    F.round(F.sum('dollar_value'), 2).alias('revenue'),
    F.round(F.mean(F.col("dollar_value")),2).alias("revenue_per_order")
)
agg_transactions = agg_transactions.orderBy(["merchant_abn", "order_month"], ascending = [False, True])

# Partition the data based on merchant ABN and compute lag variables for each specific partition
merchant_partition = Window.partitionBy("merchant_abn").orderBy("order_month")
agg_transactions = agg_transactions.withColumns({
    "revenue_lag": F.lag("revenue").over(merchant_partition),
    "expected_profit": F.round(F.col("revenue") * F.col("take_rate")/100,2)
    })
agg_transactions = agg_transactions.withColumn("revenue_growth",
                                               F.when(F.col("revenue_lag").isNotNull(), F.round((F.col("revenue") - F.col("revenue_lag"))/F.col("revenue_lag"), 2)).otherwise(None))
"""

We originally planned to use features such as firm value's using the Discounted Cash Flow model - assuming revenue is cash flow - and the Internal Rate of Return model. However, some merchants don't have consistent sales periods i.e. some sales periods are missing for some merchants. This situation would make the features unreliable and the comparision between merchants harder. Thus, we decide to assume that the missing sales records for periods are totally due to the merchant unable to make any sales during those period and is not because of data entry errors. We understand that there might be some merchants that only do business during certain period but looking from a perspective of a business, it's better to have merchants that have consistent sales.

In [6]:
# Create a dataframe that store the valid date range of the data
months = pd.date_range(start="2021-05-01", end="2022-08-31", freq='MS').strftime('%Y-%m').tolist()
months_df = spark.createDataFrame([(month,) for month in months], ["order_month"])

# Join with the transactions dataframe
transactions_correct_months = transactions.join(months_df, on= ['order_month'], how = 'right')

# Get the list of merchants with complete sales records
complete_merchants = transactions_correct_months.groupBy("merchant_abn").agg(F.countDistinct("order_month")\
                                                                             .alias("month_count")).filter(F.col("month_count") == len(months)).select("merchant_abn")
print(f"Number of merchants with complete sales records: {complete_merchants.count()}")

# Select the merchants with complete sales records from the aggregated sales 
print(f"Number of entries before removing merchants with missing sales records: {agg_transactions.count()}")
agg_transactions = agg_transactions.join(complete_merchants, on='merchant_abn', how='inner')
agg_transactions = agg_transactions.filter("2021-05" <= F.col("order_month")).orderBy(['merchant_abn', 'order_month'],
                                                                                      ascending = [True, True])
print(f"Number of entries after removing merchants with missing sales records: {agg_transactions.count()}")

24/09/30 22:03:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/30 22:03:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/30 22:03:34 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

Number of merchants with complete sales records: 3194


                                                                                

Number of entries before removing merchants with missing sales records: 67331


                                                                                

Number of entries after removing merchants with missing sales records: 51104


In [7]:
agg_transactions

24/09/30 22:04:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/30 22:04:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/30 22:04:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/30 22:04:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/30 22:04:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/30 22:04:17 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

merchant_abn,order_month,take_rate,num_orders,revenue,revenue_per_order,revenue_lag,expected_profit,revenue_growth
10023283211,2021-05,0.18,144,30111.88,209.11,27622.34,54.2,0.09
10023283211,2021-06,0.18,145,28790.43,198.55,30111.88,51.82,-0.04
10023283211,2021-07,0.18,153,29430.73,192.36,28790.43,52.98,0.02
10023283211,2021-08,0.18,146,32118.49,219.99,29430.73,57.81,0.09
10023283211,2021-09,0.18,175,34755.78,198.6,32118.49,62.56,0.08
10023283211,2021-10,0.18,169,39993.29,236.65,34755.78,71.99,0.15
10023283211,2021-11,0.18,217,48435.09,223.2,39993.29,87.18,0.21
10023283211,2021-12,0.18,211,45068.95,213.6,48435.09,81.12,-0.07
10023283211,2022-01,0.18,128,29431.27,229.93,45068.95,52.98,-0.35
10023283211,2022-02,0.18,134,31489.46,235.0,29431.27,56.68,0.07


In [10]:
# Does the mean/average take into account some months may have nothing?

merchant_avg_growth = agg_transactions.groupBy("merchant_abn").agg(
    F.round(F.mean("revenue_growth"),2).alias("avg_growth"),
    F.round(F.mean("expected_profit"), 2).alias('avg_monthly_expected_profit'),
    F.round(F.mean("revenue_per_order"),2).alias('avg_monthly_revenue_per_order'),
    F.round(F.mean('num_orders'), 2).alias('avg_monthly_num_order')
).orderBy("merchant_abn", ascending=True)



In [11]:
merchant_avg_growth

24/09/30 22:07:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/30 22:07:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/30 22:07:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

merchant_abn,avg_growth,avg_monthly_expected_profit,avg_monthly_revenue_per_order,avg_monthly_num_order
10023283211,0.02,63.71,216.01,164.06
10142254217,0.05,248.76,40.01,147.56
10187291046,0.05,64.28,120.95,16.38
10192359162,0.19,534.56,444.36,18.88
10206519221,0.03,1129.8,37.85,469.31
10255988167,0.12,702.47,389.96,41.63
10264435225,0.04,700.22,116.12,251.75
10279061213,0.07,481.16,297.37,28.31
10323485998,0.04,4453.99,128.58,524.13
10342410215,0.02,1045.22,386.85,43.0


In [11]:
transactions.show(5)

+------------+------------------+--------------------+--------------+-----------+-----------+
|merchant_abn|      dollar_value|            order_id|order_datetime|consumer_id|order_month|
+------------+------------------+--------------------+--------------+-----------+-----------+
| 79417999332|136.06570809815838|23acbb7b-cf98-458...|    2021-11-26|    1059280|    2021-11|
| 46451548968| 72.61581642788431|76bab304-fa2d-400...|    2021-11-26|    1195503|    2021-11|
| 89518629617|3.0783487174439297|a2ae446a-2959-41c...|    2021-11-26|     986886|    2021-11|
| 49167531725| 51.58228625503599|7080c274-17f7-4cc...|    2021-11-26|    1195503|    2021-11|
| 31101120643|25.228114942417797|8e301c0f-06ab-45c...|    2021-11-26|     986886|    2021-11|
+------------+------------------+--------------------+--------------+-----------+-----------+
only showing top 5 rows



In [8]:
predicted_merchant = spark.read.parquet(f"../data/curated/predicted_merchant_fp.parquet")
predicted_consumer = spark.read.parquet(f"../data/curated/predicted_consumer_fp.parquet")

In [10]:
predicted_merchant.show(10)
predicted_consumer.show(10)

+------------+--------------+--------------------+------------------+
|merchant_abn|order_datetime|            order_id|       merchant_fp|
+------------+--------------+--------------------+------------------+
| 10142254217|    2021-03-23|b4608467-70ce-484...| 38.71618853592903|
| 10142254217|    2021-03-14|1f43bba4-61b9-4ab...|35.834193702349424|
| 10142254217|    2021-03-03|ae0d4691-9903-487...|41.202888415759254|
| 10142254217|    2021-03-19|425de612-7e74-4bd...| 38.20976428453193|
| 10142254217|    2021-03-10|cf5c2ae7-0ef0-446...|41.202888415759254|
| 10142254217|    2021-03-25|7b0ee883-758a-45b...|40.156302277471426|
| 10142254217|    2021-03-20|e041a987-14ce-486...|39.847837391341116|
| 10142254217|    2021-03-29|f5083c2c-5bce-497...| 38.50966065987536|
| 10142254217|    2021-03-20|e4ea44af-7a00-408...| 36.35132544975467|
| 10142254217|    2021-03-20|93649919-82a6-440...|39.847837391341116|
+------------+--------------+--------------------+------------------+
only showing top 10 

In [15]:
path = "../data/curated"

# Read cleaned consumer data in
consumer_fp = spark.read.parquet(f"{path}/consumer_fp.parquet")
consumer_info = spark.read.parquet(f"{path}/consumer_info.parquet")

# Merge the dataset
print(f"Number of entries before the merge: {consumer_fp.count()}")
consumer_fp = consumer_fp.join(consumer_info, on = ['consumer_id'], how = 'inner').select('consumer_id', 'order_datetime', 'fraud_probability')
print(f'Number of entries after the merge: {consumer_fp.count()}')


# Read merchant datasets in
merchant_info = spark.read.parquet(f"{path}/merchant_info.parquet")
merchant_fp = spark.read.parquet(f"{path}/merchant_fp.parquet")

# Merge the dataset
print(f"Number of entries before the merge: {merchant_fp.count()}")
merchant_fp = merchant_fp.join(merchant_info, on = ['merchant_abn'], how = 'inner').select('merchant_abn', 'order_datetime', 'fraud_probability')
print(f'Number of entries after the merge: {merchant_fp.count()}')

Number of entries before the merge: 34864
Number of entries after the merge: 34864
Number of entries before the merge: 114
Number of entries after the merge: 95


In [21]:
consumer_info.show(5)
merchant_info.show(5)

+--------------------+-----------+------+-----+--------+
|                name|consumer_id|gender|state|postcode|
+--------------------+-----------+------+-----+--------+
|     Courtney Mendez|     680810|Female|  QLD|    9013|
|          John Garza|     531402|  Male|   SA|    5301|
|          Adam Bowen|    1440589|  Male|   SA|    5201|
|     Jeffrey Harding|     699376|  Male|  VIC|    8205|
|Mr. Antonio Peterson|     829756|  Male|  QLD|    4075|
+--------------------+-----------+------+-----+--------+
only showing top 5 rows

+--------------------+------------+--------------------+-------------+---------+
|                name|merchant_abn|            category|revenue_level|take_rate|
+--------------------+------------+--------------------+-------------+---------+
|       Felis Limited| 10023283211|furniture, home f...|            e|     0.18|
|Arcu Ac Orci Corp...| 10142254217|cable, satellite,...|            b|     4.22|
|    Nunc Sed Company| 10165489824|jewelry, watch, c...| 

In [20]:
consumer_fp.show(10)
merchant_fp.show(10)

+-----------+--------------+------------------+
|consumer_id|order_datetime| fraud_probability|
+-----------+--------------+------------------+
|    1195503|    2022-02-20| 9.805431136520959|
|     179208|    2021-09-25|10.069850934775245|
|     179208|    2021-08-30| 9.599513915425788|
|    1194530|    2021-11-03| 8.300636455314633|
|     154128|    2021-10-09| 9.633302411090419|
|     712975|    2022-01-11|27.496186536467164|
|     712975|    2021-10-04|10.868364868449886|
|     712975|    2022-02-08|  9.02022421158597|
|     407340|    2021-12-12|10.459280127078758|
|     650435|    2021-12-13| 10.58055311139687|
+-----------+--------------+------------------+
only showing top 10 rows

+------------+--------------+------------------+
|merchant_abn|order_datetime| fraud_probability|
+------------+--------------+------------------+
| 11149063370|    2022-02-25| 51.01538421455241|
| 11149063370|    2021-11-14|52.407803322764764|
| 11149063370|    2021-08-28| 56.43761254995139|
| 114709

In [22]:
print(f'Number of rows before the merge: {transactions.count()}')

# join with consumer fraud prob and rename the col
transactions_df = transactions.join(consumer_fp, on = ['consumer_id', 'order_datetime'], how='left')
transactions_df = transactions_df.withColumnRenamed("fraud_probability", "consumer_fp")
print(f'Number of rows after the merge with consumer fraud probability: {transactions_df.count()}')

# Join with merchant fraud prob and rename the col
transactions_df = transactions_df.join(merchant_fp, on = ['merchant_abn', 'order_datetime'], how='left')
transactions_df = transactions_df.withColumnRenamed("fraud_probability", "merchant_fp")
print(f'Number of rows after the merge with merchant fraud probability: {transactions_df.count()}')

transactions_df = transactions_df.join(merchant_info, on = ['merchant_abn'], how='inner')
print(f'Number of rows after the merge with merchant information: {transactions_df.count()}')

Number of rows before the merge: 12561377


                                                                                

Number of rows after the merge with consumer fraud probability: 12561589


                                                                                

Number of rows after the merge with merchant fraud probability: 12561589




Number of rows after the merge with merchant information: 12047496


                                                                                

In [23]:
transactions_df.show(5)

                                                                                

+------------+--------------+-----------+------------------+--------------------+-----------+-----------+-----------+--------------------+--------------------+-------------+---------+
|merchant_abn|order_datetime|consumer_id|      dollar_value|            order_id|order_month|consumer_fp|merchant_fp|                name|            category|revenue_level|take_rate|
+------------+--------------+-----------+------------------+--------------------+-----------+-----------+-----------+--------------------+--------------------+-------------+---------+
| 24015173965|    2021-11-26|     154128|             157.0|5b867ac9-d1e0-430...|    2021-11|       NULL|       NULL|      Lectus Limited|cable, satellite,...|            a|     6.79|
| 98973094975|    2021-11-26|     289140|138.76652355027088|4a9b8513-91ef-4a1...|    2021-11|       NULL|       NULL|   Ornare Fusce Inc.|hobby, toy and ga...|            a|     5.98|
| 56796971172|    2021-11-26|    1221456|238.56434228408926|f769a07a-88f2-43d...

In [30]:
# Aggregate the data into desired format
agg_transactions = transactions_df.groupBy("merchant_abn", "order_month", "take_rate").agg(
    F.count(F.col("order_id")).alias("num_orders"),
    F.round(F.sum('dollar_value'), 2).alias('revenue'),
    F.round(F.mean(F.col("dollar_value")),2).alias("revenue_per_order"),
    F.first(F.col("merchant_fp")).alias("merchant_fp"),
    F.first(F.col("consumer_fp")).alias("consumer_fp"),
    F.first(F.col("order_id")).alias("order_id")
)
agg_transactions = agg_transactions.orderBy(["merchant_abn", "order_month"], ascending = [False, True])

# Partition the data based on merchant ABN and compute lag variables for each specific partition
merchant_partition = Window.partitionBy("merchant_abn").orderBy("order_month")
agg_transactions = agg_transactions.withColumns({
    "revenue_lag": F.lag("revenue").over(merchant_partition),
    "expected_profit": F.round(F.col("revenue") * F.col("take_rate")/100,2)
    })
agg_transactions = agg_transactions.withColumn("revenue_growth",
                                               F.when(F.col("revenue_lag").isNotNull(), F.round((F.col("revenue") - F.col("revenue_lag"))/F.col("revenue_lag"), 2)).otherwise(None))

In [29]:
agg_transactions.show(5)

                                                                                

+------------+-----------+---------+----------+--------+-----------------+-----------+-----------+-----------+---------------+--------------+
|merchant_abn|order_month|take_rate|num_orders| revenue|revenue_per_order|merchant_fp|consumer_fp|revenue_lag|expected_profit|revenue_growth|
+------------+-----------+---------+----------+--------+-----------------+-----------+-----------+-----------+---------------+--------------+
| 10023283211|    2021-02|     0.18|         3|  701.57|           233.86|       NULL|       NULL|       NULL|           1.26|          NULL|
| 10023283211|    2021-03|     0.18|       111|24634.35|           221.93|       NULL|       NULL|     701.57|          44.34|         34.11|
| 10023283211|    2021-04|     0.18|       129|27622.34|           214.13|       NULL|       NULL|   24634.35|          49.72|          0.12|
| 10023283211|    2021-05|     0.18|       144|30111.88|           209.11|       NULL|       NULL|   27622.34|           54.2|          0.09|
| 1002

In [35]:
from pyspark.sql import functions as F


# Select and rename columns in `predicted_merchant` before joining
predicted_merchant = predicted_merchant.select(
    "order_id", 
    F.col("merchant_fp").alias("predicted_merchant_fp")
)

# Select and rename columns in `predicted_consumer` before joining
predicted_consumer = predicted_consumer.select(
    "order_id", 
    F.col("consumer_fp").alias("predicted_consumer_fp")
)


# Join `agg_transactions` with `predicted_merchant` to get predicted merchant fraud probabilities
agg_transactions_with_merchant_fp = agg_transactions.join(
    predicted_merchant, 
    on="order_id", 
    how="left"
)

# Join `agg_transactions_with_merchant_fp` with `predicted_consumer` to get predicted consumer fraud probabilities
agg_transactions_with_fp = agg_transactions_with_merchant_fp.join(
    predicted_consumer, 
    on="order_id", 
    how="left"
)


In [36]:
agg_transactions_with_fp.show(10)

                                                                                

+--------------------+------------+-----------+---------+----------+--------+-----------------+-----------+-----------+-----------+---------------+--------------+---------------------+---------------------+
|            order_id|merchant_abn|order_month|take_rate|num_orders| revenue|revenue_per_order|merchant_fp|consumer_fp|revenue_lag|expected_profit|revenue_growth|predicted_merchant_fp|predicted_consumer_fp|
+--------------------+------------+-----------+---------+----------+--------+-----------------+-----------+-----------+-----------+---------------+--------------+---------------------+---------------------+
|1e731bf4-131e-438...| 10023283211|    2021-06|     0.18|       145|28790.43|           198.55|       NULL|       NULL|   30111.88|          51.82|         -0.04|    53.87780562295093|   12.030415540578806|
|64229c09-4d9d-4e5...| 10023283211|    2021-08|     0.18|       146|32118.49|           219.99|       NULL|       NULL|   29430.73|          57.81|          0.09|   57.6037

In [37]:

# Impute `merchant_fp` with `predicted_merchant_fp` where `merchant_fp` is NULL
agg_transactions_with_fp = agg_transactions_with_fp.withColumn(
    "merchant_fp",
    F.when(
        F.col("merchant_fp").isNull(),
        F.col("predicted_merchant_fp")
    ).otherwise(F.col("merchant_fp"))
)

# Impute `consumer_fp` with `predicted_consumer_fp` where `consumer_fp` is NULL
agg_transactions_with_fp = agg_transactions_with_fp.withColumn(
    "consumer_fp",
    F.when(
        F.col("consumer_fp").isNull(),
        F.col("predicted_consumer_fp")
    ).otherwise(F.col("consumer_fp"))
)

# Drop the temporary columns used for joining if they are no longer needed
agg_transactions_imputed = agg_transactions_with_fp.drop("predicted_merchant_fp", "predicted_consumer_fp")

# Display the resulting DataFrame
agg_transactions_imputed.show(5)

[Stage 532:>                                                        (0 + 1) / 1]

+--------------------+------------+-----------+---------+----------+--------+-----------------+-----------------+------------------+-----------+---------------+--------------+
|            order_id|merchant_abn|order_month|take_rate|num_orders| revenue|revenue_per_order|      merchant_fp|       consumer_fp|revenue_lag|expected_profit|revenue_growth|
+--------------------+------------+-----------+---------+----------+--------+-----------------+-----------------+------------------+-----------+---------------+--------------+
|1e731bf4-131e-438...| 10023283211|    2021-06|     0.18|       145|28790.43|           198.55|53.87780562295093|12.030415540578806|   30111.88|          51.82|         -0.04|
|8dd81a17-1391-4dd...| 10385011947|    2021-09|     3.17|         2| 1383.45|           691.73|48.12942116695931| 13.49231179727444|     971.97|          43.86|          0.42|
|d374c660-c632-433...| 10142254217|    2021-02|     4.22|         1|   10.89|            10.89|             NULL|       

                                                                                

In [None]:
from pyspark.sql import functions as F

# Weights for fraud probabilities
alpha = 0.5  # Weight for merchant fraud probability
beta = 0.5   # Weight for consumer fraud probability

# Weights for final score components
w1 = 0.4  # Weight for risk-adjusted profit (merchant)
w2 = 0.4  # Weight for risk-adjusted profit (consumer)
w3 = 0.1  # Weight for average order volume
w4 = 0.1  # Weight for combined order volume and revenue growth rate

# 1. Calculate Combined Fraud Probability (CFP)
merchant_avg_growth = merchant_avg_growth.withColumn(
    "combined_fraud_probability", 
    alpha * F.col("merchant_fraud_probability") + beta * F.col("consumer_fraud_probability")
)

# 2. Calculate Risk-Adjusted Profit for Merchant Fraud
merchant_avg_growth = merchant_avg_growth.withColumn(
    "risk_adjusted_profit_merchant", 
    F.col("avg_monthly_expected_profit") * (1 - alpha * F.col("merchant_fraud_probability"))
)

# 3. Calculate Risk-Adjusted Profit for Consumer Fraud
merchant_avg_growth = merchant_avg_growth.withColumn(
    "risk_adjusted_profit_consumer", 
    F.col("avg_monthly_expected_profit") * (1 - beta * F.col("consumer_fraud_probability"))
)

# 4. Calculate Combined Risk-Adjusted Profit
merchant_avg_growth = merchant_avg_growth.withColumn(
    "combined_risk_adjusted_profit", 
    F.col("avg_monthly_expected_profit") * (1 - F.col("combined_fraud_probability"))
)

# 5. Calculate Average Order Volume (Activeness Feature)
# Assuming 'avg_monthly_num_order' is the average number of orders per month for each merchant

# 6. Calculate Joint Feature for Order Volume & Revenue Growth Rate
# A simple way to combine order volume and revenue growth could be to multiply the two
merchant_avg_growth = merchant_avg_growth.withColumn(
    "joint_order_volume_revenue_growth",
    F.col("avg_monthly_num_order") * F.col("avg_growth")
)

# 7. Calculate Final Adjusted Score
merchant_avg_growth = merchant_avg_growth.withColumn(
    "adjusted_score",
    w1 * F.col("risk_adjusted_profit_merchant") + 
    w2 * F.col("risk_adjusted_profit_consumer") + 
    w3 * F.col("avg_monthly_num_order") +  # Weight for average order volume
    w4 * F.col("joint_order_volume_revenue_growth")  # Weight for combined order volume and revenue growth
)

# 8. Rank the merchants based on the final adjusted score
merchant_avg_growth = merchant_avg_growth.orderBy(F.col("adjusted_score").desc())

# Display the final ranked DataFrame
merchant_avg_growth.select("merchant_abn", "adjusted_score").show(10)
