In [1]:
from pyspark.sql import functions as F, SparkSession, Window
import pandas as pd

In [2]:
# Spark session shared by every cell in this notebook.
# The options live in one mapping so they can be scanned and tuned in a single place;
# they are applied to the builder in the order listed.
_spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "2g",
}

_session_builder = SparkSession.builder.appName("Ranking Model")
for _option, _setting in _spark_settings.items():
    _session_builder = _session_builder.config(_option, _setting)

spark = _session_builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
24/09/28 16:18:59 WARN Utils: Your hostname, Skye-Ngu resolves to a loopback address: 127.0.1.1; using 172.17.250.30 instead (on interface eth0)
24/09/28 16:18:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/28 16:19:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Feature engineering

In [3]:
# Load the curated transactions and derive each order's calendar month ("yyyy-MM"),
# the time grain used by the monthly aggregates later in this notebook.
# The path is a plain string literal: it has no placeholders, so no f-string is needed.
transactions = (
    spark.read.parquet("../data/curated/transactions.parquet")
    .withColumn("order_month", F.date_format(F.col("order_datetime"), "yyyy-MM"))
)

                                                                                

In [14]:
# Preview a single row to check the available columns, including the derived order_month
transactions.limit(1)

merchant_abn,order_datetime,consumer_id,dollar_value,order_id,consumer_fp,merchant_fp,name,category,revenue_level,take_rate,order_month
68216911708,2021-08-20,148519,49.86466328799216,df791327-3a6f-40b...,,,Placerat Eget Ven...,"computers, comput...",c,3.05,2021-08


We think that the forecasted revenue of each month for each merchant will be useful in determining if we should onboard the merchant to the BNPL firm as it's an indication of future growth. The growth in number of orders each month might also be important.

In [24]:
# Aggregate the data into one row per merchant per month.
# NOTE(review): take_rate is part of the grouping key; if a merchant's take_rate ever
# differs between transactions in the same month this produces several rows for that
# month — confirm take_rate is constant per merchant.
agg_transactions = transactions.groupBy("merchant_abn", "order_month", "take_rate").agg(
    F.count(F.col("order_id")).alias("num_orders"),
    F.round(F.sum("dollar_value"), 2).alias("revenue"),
    F.round(F.mean(F.col("dollar_value")), 2).alias("revenue_per_order"),
)
# No global sort at this point: the window below repartitions by merchant and sorts
# each partition by month on its own, so a preceding orderBy would only add a shuffle
# whose ordering is thrown away.

# Partition the data by merchant ABN and compute lag variables within each partition.
# NOTE(review): lag() returns the previous *available* month for the merchant, which is
# not necessarily the previous calendar month when the merchant has gaps in its sales.
merchant_partition = Window.partitionBy("merchant_abn").orderBy("order_month")
agg_transactions = agg_transactions.withColumns({
    "revenue_lag": F.lag("revenue").over(merchant_partition),
    # presumably take_rate is a percentage, hence the division by 100 — TODO confirm
    "expected_profit": F.round(F.col("revenue") * F.col("take_rate") / 100, 2),
})

# Relative change in revenue versus the previous row of the same merchant.
# Left null when there is no previous revenue (first month) or when the previous
# revenue is 0, so the division can neither fail (ANSI mode) nor yield a meaningless ratio.
previous_revenue = F.col("revenue_lag")
agg_transactions = agg_transactions.withColumn(
    "revenue_growth",
    F.when(
        previous_revenue.isNotNull() & (previous_revenue != 0),
        F.round((F.col("revenue") - previous_revenue) / previous_revenue, 2),
    ),
)

We originally planned to use features such as each firm's value under the Discounted Cash Flow model (treating revenue as cash flow) and the Internal Rate of Return. However, some merchants don't have consistent sales periods, i.e. some monthly sales periods are missing for some merchants. This would make those features unreliable and the comparison between merchants harder. Thus, we assume that a missing sales record for a period is entirely due to the merchant being unable to make any sales during that period, and not due to data entry errors. We understand that some merchants may only do business during certain periods, but from a business perspective it is better to have merchants with consistent sales.

In [26]:
# Create a dataframe that stores the valid month range of the data ("yyyy-MM" labels).
# `months` is the single source of truth for the analysis window: it drives both the
# completeness check and the final filter, so the two cannot drift apart.
months = pd.date_range(start="2021-05-01", end="2022-08-31", freq='MS').strftime('%Y-%m').tolist()
months_df = spark.createDataFrame([(month,) for month in months], ["order_month"])

# Keep only the transactions that fall inside the valid window.
# An inner join is used so that a month without any transaction does not create a
# row with a null merchant_abn (which a right join would).
transactions_correct_months = transactions.join(months_df, on=["order_month"], how="inner")

# Get the list of merchants with complete sales records, i.e. at least one
# transaction in every month of the window.
complete_merchants = (
    transactions_correct_months.groupBy("merchant_abn")
    .agg(F.countDistinct("order_month").alias("month_count"))
    .filter(F.col("month_count") == len(months))
    .select("merchant_abn")
)
print(f"Number of merchants with complete sales records: {complete_merchants.count()}")

# Select the merchants with complete sales records from the aggregated sales and
# restrict the rows to the same window (both lower AND upper bound).
print(f"Number of entries before removing merchants with missing sales records: {agg_transactions.count()}")
agg_transactions = agg_transactions.join(complete_merchants, on='merchant_abn', how='inner')
agg_transactions = agg_transactions.filter(F.col("order_month").isin(months)).orderBy(
    ['merchant_abn', 'order_month'], ascending=[True, True]
)
print(f"Number of entries after removing merchants with missing sales records: {agg_transactions.count()}")

24/09/28 17:23:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 17:23:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 17:23:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

Number of merchants with complete sales records: 3194


                                                                                

Number of entries before removing merchants with missing sales records: 67331


24/09/28 17:23:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 17:23:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 17:23:58 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Number of entries after removing merchants with missing sales records: 51104


                                                                                

In [27]:
# Display the monthly per-merchant aggregates with the lag, profit and growth columns
agg_transactions

24/09/28 17:38:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 17:38:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 17:38:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 17:38:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 17:38:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 17:38:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

merchant_abn,order_month,take_rate,num_orders,revenue,revenue_per_order,revenue_lag,expected_profit,revenue_growth
10023283211,2021-05,0.18,144,30111.88,209.11,27622.34,54.2,0.09
10023283211,2021-06,0.18,145,28790.43,198.55,30111.88,51.82,-0.04
10023283211,2021-07,0.18,153,29430.73,192.36,28790.43,52.98,0.02
10023283211,2021-08,0.18,146,32118.49,219.99,29430.73,57.81,0.09
10023283211,2021-09,0.18,175,34755.78,198.6,32118.49,62.56,0.08
10023283211,2021-10,0.18,169,39993.29,236.65,34755.78,71.99,0.15
10023283211,2021-11,0.18,217,48435.09,223.2,39993.29,87.18,0.21
10023283211,2021-12,0.18,211,45068.95,213.6,48435.09,81.12,-0.07
10023283211,2022-01,0.18,128,29431.27,229.93,45068.95,52.98,-0.35
10023283211,2022-02,0.18,134,31489.46,235.0,29431.27,56.68,0.07


In [12]:
# Mean of the monthly revenue_growth values for each merchant, rounded to 2 decimals
# and listed in ascending merchant_abn order.
_avg_growth = F.round(F.mean("revenue_growth"), 2).alias("avg_growth")
_growth_by_merchant = agg_transactions.groupBy("merchant_abn").agg(_avg_growth)
merchant_avg_growth = _growth_by_merchant.orderBy("merchant_abn", ascending=True)