In [66]:
from pyspark.sql import functions as F, SparkSession, Window
import pandas as pd

In [67]:
# Session-level Spark settings, gathered in one place so they are easy to tune.
SPARK_CONF = {
    "spark.sql.repl.eagerEval.enabled": True,      # eager evaluation in the REPL/notebook
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",       # session time zone
    "spark.driver.memory": "4g",
    "spark.executor.memory": "2g",
}

# Build the "Ranking Model" session (or reuse an existing one), applying the
# settings above in declaration order.
builder = SparkSession.builder.appName("Ranking Model")
for conf_key, conf_value in SPARK_CONF.items():
    builder = builder.config(conf_key, conf_value)
spark = builder.getOrCreate()

# Feature engineering

In [68]:
# Load the curated transactions and add `order_month`, the order timestamp
# formatted as "yyyy-MM", which is the monthly grouping key used below.
transactions = (
    spark.read.parquet("../data/curated/transactions.parquet")
    .withColumn("order_month", F.date_format(F.col("order_datetime"), "yyyy-MM"))
)

We expect each merchant's forecasted monthly revenue to be useful in deciding whether the BNPL firm should onboard that merchant, since it is an indication of future growth. The month-on-month growth in the number of orders might also be important.

In [69]:
# Per-merchant, per-month summary: number of orders, total revenue and the
# mean dollar value per order (both monetary figures rounded to 2 decimals).
monthly_metrics = [
    F.count(F.col("order_id")).alias("num_orders"),
    F.round(F.sum('dollar_value'), 2).alias('revenue'),
    F.round(F.mean(F.col("dollar_value")), 2).alias("revenue_per_order"),
]
agg_transactions = (
    transactions.groupBy("merchant_abn", "order_month")
    .agg(*monthly_metrics)
    .orderBy(["merchant_abn", "order_month"], ascending=[False, True])
)

# Window over each merchant's months in chronological order, so `lag` returns
# the revenue of that merchant's previous recorded month.
merchant_partition = Window.partitionBy("merchant_abn").orderBy("order_month")

# Relative change against the previous recorded month, rounded to 2 decimals.
# A merchant's first month has no lag value, so its growth is left null.
previous_revenue = F.col("revenue_lag")
relative_change = F.round((F.col("revenue") - previous_revenue) / previous_revenue, 2)
agg_transactions = (
    agg_transactions
    .withColumn("revenue_lag", F.lag("revenue").over(merchant_partition))
    .withColumn(
        "revenue_growth",
        F.when(previous_revenue.isNotNull(), relative_change).otherwise(None),
    )
)

We originally planned to use features such as each firm's value under the Discounted Cash Flow model (treating revenue as cash flow) and the Internal Rate of Return model. However, some merchants don't have consistent sales periods, i.e. some sales periods are missing for some merchants. This would make those features unreliable and the comparison between merchants harder. We therefore assume that a missing sales record for a period means the merchant was unable to make any sales during that period, and is not the result of a data entry error. We understand that some merchants may only do business during certain periods, but from a business perspective it is better to have merchants with consistent sales.

In [70]:
# Months (as "YYYY-MM" strings) in which a merchant must have at least one
# transaction to count as having a complete sales record.
months = pd.date_range(start="2021-05-01", end="2022-08-31", freq='MS').strftime('%Y-%m').tolist()
months_df = spark.createDataFrame([(month,) for month in months], ["order_month"])

# Restrict the transactions to the valid months.
# This join is only a filter, so it is an inner join rather than a right join:
#   * a right join also emits a row with a null merchant_abn for every valid
#     month that has no transactions at all, which creates a spurious
#     null-merchant group in the aggregation below;
#   * with an inner join the tiny months table can be broadcast, so the large
#     transactions table does not have to be shuffled for the join.
transactions_correct_months = transactions.join(
    F.broadcast(months_df), on=['order_month'], how='inner'
)

# Get the list of merchants with complete sales records, i.e. merchants whose
# number of distinct active months equals the number of valid months.
complete_merchants = (
    transactions_correct_months.groupBy("merchant_abn")
    .agg(F.countDistinct("order_month").alias("month_count"))
    .filter(F.col("month_count") == len(months))
    .select("merchant_abn")
)
print(f"Number of merchants with complete sales records: {complete_merchants.count()}")

# Select the merchants with complete sales records from the aggregated sales.
# NOTE(review): this join filters on merchant_abn only, so any months outside
# `months` that these merchants have in agg_transactions are kept — confirm
# whether out-of-range months should be dropped as well.
print(f"Number of entries before removing merchants with missing sales records: {agg_transactions.count()}")
agg_transactions = agg_transactions.join(complete_merchants, on='merchant_abn', how='inner')
print(f"Number of entries after removing merchants with missing sales records: {agg_transactions.count()}")

24/09/28 15:35:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 15:35:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/09/28 15:35:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

Number of merchants with complete sales records: 3194


                                                                                

Number of entries before removing merchants with missing sales records: 69251


                                                                                

Number of entries after removing merchants with missing sales records: 59329
