# Modelling: monthly revenue
In this notebook, we model an additional feature, the future revenue of each merchant, based on the most recent 12 months of transactions. We test Linear Regression and Random Forest models, accounting for monthly seasonal changes and fitting separate model parameters for merchants in different revenue bands.

This becomes one of the 5 key metrics used to rank merchants for the BNPL scheme (predicted revenue growth rate).

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.regression import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Build (or reuse) the Spark session used by every cell below.
# NOTE(review): spark.driver.maxResultSize (16G) is larger than
# spark.driver.memory (12g) — confirm this is intended.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "12g",
    "spark.driver.maxResultSize": "16G",
    "spark.executor.memory": "16G",
    "spark.sql.files.maxPartitionBytes": "64MB",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.network.timeout": "600s",
}

session_builder = SparkSession.builder.appName("Modelling monthly revenue")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

In [3]:
# Load every parquet dataset found under the curated `all_details` directory.
# The cells below read the `order_datetime`, `merchant_abn` and `dollar_value` columns from it.
transactions = spark.read.parquet('../data/curated/all_details/*')

In [None]:
# Aggregate transactions per (month, merchant) in a single pass, producing
#   monthly_revenue     = sum of dollar_value
#   number_transactions = count of non-null dollar_value
# Doing both aggregations in one `agg` avoids a second shuffle and a self-join
# on the auto-generated `date_format(order_datetime, yyyy-MM)` column name.
# The grouping column deliberately keeps that default name because a later
# cell renames it to `month_year`.
# NOTE(review): the previous inner self-join would have dropped groups whose
# month or merchant_abn is null; this version keeps them — confirm the curated
# data has no null keys.
merchant_monthly = (
    transactions
    .groupBy(F.date_format(F.col('order_datetime'), 'yyyy-MM'), 'merchant_abn')
    .agg(
        F.sum('dollar_value').alias('monthly_revenue'),
        F.count('dollar_value').alias('number_transactions'),
    )
)
merchant_monthly.show()

In [None]:
# Load the cleaned merchant table and derive the average value per transaction
# from its pre-aggregated `sum(dollar_value)` and `count(dollar_value)` columns.
merchant = (
    spark.read.parquet('../data/curated/part_1/clean_merchant.parquet')
    .withColumn(
        'average_revenue',
        F.col('sum(dollar_value)') / F.col('count(dollar_value)'),
    )
)
merchant.show()

In [None]:
# Attach the monthly aggregates to every merchant. The left join keeps
# merchants that have no monthly rows (their monthly columns are null).
merchant_with_revenue = merchant.join(
    merchant_monthly,
    on='merchant_abn',
    how='left',
)

# Merchants with no matching monthly aggregate at all (anti join), and how many there are.
omitted_merchants = merchant.join(
    merchant_monthly,
    on='merchant_abn',
    how='left_anti',
)
omitted_merchants.count()

In [None]:
# Give the merchant-level revenue total and the month key readable names.
merchant_with_revenue = merchant_with_revenue.withColumnRenamed(
    'sum(dollar_value)', 'total_revenue'
)
merchant_with_revenue = merchant_with_revenue.withColumnRenamed(
    'date_format(order_datetime, yyyy-MM)', 'month_year'
)
merchant_with_revenue.show(5)

In [8]:
# Extract the calendar month number from the 'yyyy-MM' month_year string
merchant_with_revenue = merchant_with_revenue.withColumn(
    'month',
    F.month(F.col('month_year')),
)

In [None]:
# Preview five rows of merchant_with_revenue
merchant_with_revenue.show(5)

In [10]:
# Window helper for per-merchant aggregates
from pyspark.sql.window import Window

# Earliest month_year seen for each merchant
per_merchant = Window.partitionBy('merchant_abn')
merchant_with_revenue = merchant_with_revenue.withColumn(
    'first_transaction',
    F.min('month_year').over(per_merchant),
)

# Number of months elapsed between that first month and the row's month
merchant_with_revenue = merchant_with_revenue.withColumn(
    'month_since_first_transaction',
    F.months_between(F.col('month_year'), F.col('first_transaction')),
)

In [None]:
# Preview five rows of merchant_with_revenue
merchant_with_revenue.show(5)

In [12]:
# Monthly earning: monthly revenue multiplied by take_rate, divided by 100
merchant_with_revenue = merchant_with_revenue.withColumn(
    'monthly_earning',
    (F.col('monthly_revenue') * F.col('take_rate')) / 100,
)

In [None]:
# Preview five rows of merchant_with_revenue
merchant_with_revenue.show(5)

In [17]:
# Cyclical encoding of the month as sine / cosine of its position in the year.
# NOTE(review): 3.14 is used as an approximation of pi; the forecast cell that
# builds the 2023 months uses the same constant, so change both together if at all.
month_angle = 2 * 3.14 * F.col('month') / 12
merchant_with_revenue = merchant_with_revenue.withColumn('month_sin', F.sin(month_angle))
merchant_with_revenue = merchant_with_revenue.withColumn('month_cos', F.cos(month_angle))

In [None]:
# Correlation analysis of the numerical features

# Columns included in the correlation matrix
numerical_columns = [
    'take_rate',
    'total_revenue',
    'count(dollar_value)',
    'log_ratio',
    'unscaled_earning',
    'average_revenue',
    'monthly_revenue',
    'number_transactions',
    'month_since_first_transaction',
    'monthly_earning',
]

# Bring only those columns to the driver as a pandas DataFrame
merchant_with_revenue_pd = merchant_with_revenue.select(*numerical_columns).toPandas()

# Correlation matrix of the selected columns (pandas default method)
corr_df = merchant_with_revenue_pd.corr()

# Draw the annotated heatmap, save it to disk, then display it
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Features Correlation Heatmap')
fig.savefig("../plots/Features_Correlation_Heatmap.png")
plt.show()

In [26]:
# Time-based split: months before 2022-07 are used for training, 2022-07 onwards for testing.
# month_year is a 'yyyy-MM' string, so string comparison orders the months chronologically.
train = merchant_with_revenue.filter(F.col('month_year') < '2022-07')
test = merchant_with_revenue.filter(F.col('month_year') >= '2022-07')

# One model is fitted per revenue level
revenue_levels = ['a', 'b', 'c', 'd', 'e']

# Setting up the pipeline for linear regression
assembler = VectorAssembler(
    inputCols=[
        'month_since_first_transaction',
        'total_revenue',
        'count(dollar_value)',
        'month_sin',
        'month_cos',
    ],
    outputCol='features',
)
# elasticNetParam=1 selects a pure L1 (lasso) penalty; the label is the raw monthly revenue
lr = LinearRegression(featuresCol='features', labelCol='monthly_revenue', elasticNetParam=1)

# Create the pipeline
pipeline = Pipeline(stages=[assembler, lr])

# Define the parameter grid for tuning
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

# Define the evaluator: RMSE of `prediction` against `monthly_revenue`
evaluator = RegressionEvaluator(labelCol='monthly_revenue', predictionCol='prediction', metricName='rmse')

# Cross-validator used to fit each model. A fixed seed makes the fold assignment,
# and therefore the selected regParam, reproducible across reruns.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=42)

In [None]:
# Fit one cross-validated linear regression pipeline per revenue level and persist the best one.
models = {}

for level in revenue_levels:
    # Filter data for the current revenue level
    level_data = train.filter(F.col('revenue_level') == level)
    # Train the model for the revenue level
    model = crossval.fit(level_data)
    # Store the model
    models[level] = model
    # Save the best pipeline to disk. overwrite() is required so that rerunning
    # this cell does not fail because the model directory already exists.
    model.bestModel.write().overwrite().save(f'../models/revenue_linear_regression_{level}')

In [None]:
# Evaluate each saved linear regression pipeline on the held-out months of its revenue level.
for level in revenue_levels:
    # load the model
    best_model = PipelineModel.load(f'../models/revenue_linear_regression_{level}')
    test_data = test.filter(F.col('revenue_level') == level)
    predictions = best_model.transform(test_data)

    # The model is trained on the raw `monthly_revenue` label (no log transform),
    # so the prediction is already in revenue units and must not be exponentiated.
    predictions = predictions.withColumn('predicted_monthly_revenue', F.col('prediction'))

    # RMSE of `prediction` against `monthly_revenue`
    rmse = evaluator.evaluate(predictions)
    print(f"RMSE for revenue level {level}: {rmse}")

In [29]:
# Random forest counterpart of the linear model, fitted on the same raw
# `monthly_revenue` label and the same assembled feature vector.
# The seed makes the bootstrap and feature sampling reproducible.
rf = RandomForestRegressor(featuresCol='features', labelCol='monthly_revenue', seed=42)
pipeline_rf = Pipeline(stages=[assembler, rf])

# The grid must be built from the params of the estimator *instance* in the pipeline.
# Class-level params (RandomForestRegressor.numTrees) do not belong to that instance,
# so they are silently ignored during fitting and every grid point would train the
# same default forest.
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

# Seeded so the fold assignment is reproducible across reruns
crossval_rf = CrossValidator(estimator=pipeline_rf,
                            estimatorParamMaps=paramGrid_rf,
                            evaluator=evaluator,
                            numFolds=3,
                            seed=42)

In [None]:
# Fit one cross-validated random forest pipeline per revenue level and persist the best one.
# NOTE: this rebinds `models`, replacing the dictionary created by the linear regression cell.
models = {}

for level in revenue_levels:
    # Filter data for the current revenue level
    level_data = train.filter(F.col('revenue_level') == level)
    # Train the model for the revenue level
    model = crossval_rf.fit(level_data)
    # Store the model
    models[level] = model
    # Save the best pipeline to disk. overwrite() is required so that rerunning
    # this cell does not fail because the model directory already exists.
    model.bestModel.write().overwrite().save(f'../models/revenue_random_forest_{level}')

In [None]:
# Evaluate each saved random forest pipeline on the held-out months of its revenue level.
for level in revenue_levels:
    # load the model
    best_model = PipelineModel.load(f'../models/revenue_random_forest_{level}')
    test_data = test.filter(F.col('revenue_level') == level)
    predictions = best_model.transform(test_data)

    # The model is trained on the raw `monthly_revenue` label (no log transform),
    # so the prediction is already in revenue units and must not be exponentiated.
    predictions = predictions.withColumn('predicted_monthly_revenue', F.col('prediction'))

    # RMSE of `prediction` against `monthly_revenue`
    rmse = evaluator.evaluate(predictions)
    print(f"RMSE for revenue level {level}: {rmse}")

In [None]:
# Build the scoring frame for next year's forecast (2023-01 to 2023-12):
# one row per (month, merchant), carrying the merchant-level columns
# (total revenue, take rate, 'count(dollar_value)', ...) unchanged, plus the
# same month-derived features that were used for training.

# The twelve target months as 'yyyy-MM' strings
months = [f'2023-{month_number:02d}' for month_number in range(1, 13)]

# Every distinct merchant
all_merchants = merchant.select('merchant_abn').distinct()

# Same cyclical encoding (including the 3.14 constant) as the training features
forecast_angle = 2 * 3.14 * F.col('month') / 12

months_df = (
    spark.createDataFrame([(month_label,) for month_label in months], ['month_year'])
    # one row per month for every merchant
    .crossJoin(all_merchants)
    # bring in the merchant-level columns
    .join(merchant, on='merchant_abn', how='left')
    .withColumn('month', F.month(F.col('month_year')))
    .withColumn('month_sin', F.sin(forecast_angle))
    .withColumn('month_cos', F.cos(forecast_angle))
    # 23 + month gives 24..35 for January..December 2023.
    # NOTE(review): this applies the same offset to every merchant, i.e. it treats
    # each merchant's first transaction month as 2021-01 — confirm.
    .withColumn('month_since_first_transaction', 23 + F.col('month'))
)

months_df.show()

In [35]:
# Renaming the sum(dollar_value) column to total_revenue, the feature name the
# VectorAssembler in the model pipelines expects
months_df = months_df.withColumnRenamed('sum(dollar_value)', 'total_revenue')

We can see that the linear model performs better than the random forest, so we use it to predict future revenue.

In [None]:
# Forecast the monthly revenue of every merchant for every 2023 month.
# Each merchant must be scored by the linear regression model trained for its own
# revenue level, so the scoring frame is split by `revenue_level`, each slice is
# transformed by the matching saved pipeline, and the slices are unioned back together.
# (Scoring the whole frame inside the loop would leave only the last level's
# model applied to all merchants.)
# The forecast is in the `prediction` column added by the pipeline.
predictions = None

for level in revenue_levels:
    # Load the model trained for this revenue level
    best_model = PipelineModel.load(f'../models/revenue_linear_regression_{level}')
    # Score only the merchants belonging to this revenue level
    level_months = months_df.filter(F.col('revenue_level') == level)
    level_predictions = best_model.transform(level_months)
    # Accumulate the per-level results into a single DataFrame
    if predictions is None:
        predictions = level_predictions
    else:
        predictions = predictions.unionByName(level_predictions)

predictions.show()

In [42]:
# Storing the predictions in a parquet file.
# mode('overwrite') replaces any existing output at this path, so the cell can be rerun.
predictions.write.mode('overwrite').parquet('../data/curated/predicted_monthly_revenue.parquet')