# Modelling by Merchant Segment
## Forecasting Transaction Frequency in Florists

In [1]:
# Initialise a spark session
import pandas as pd
from collections import Counter
import os
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


spark = (
    SparkSession.builder.appName("Florist Model")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "16g")  # Increase driver memory
    .config("spark.executor.memory", "16g")  # Increase executor memory
    .config("spark.executor.instances", "4")  # Increase the number of executor instances
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.sql.shuffle.partitions", "100") \
    .getOrCreate()
)

24/09/25 17:20:42 WARN Utils: Your hostname, Melissas-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 192.168.0.3 instead (on interface en0)
24/09/25 17:20:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/25 17:20:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read transaction file excluding those marked as fraud
transactions = spark.read.parquet('../data/curated/flagged_fraud')
transactions = transactions.filter(F.col("is_fraud") != True) # Exclude Fraud Transactions

## Filter Transactions by Segment (Florists)
Apply lower-casing and Levenshtein distance to account for typos.

In [3]:
from pyspark.sql.functions import lower, col

florist_transactions = transactions.filter(lower(col("merchant_category")).contains("florists"))

florist_transactions.show(truncate=False, n=10)


+------------+---------+-------+------------------+------------------------------------+-----------+--------------------------+------------------+--------------------------------+--------------+-----------------+---------------+----------------------+--------------------------+--------------+----------------+-------------+---------------------------------+-----------------+---------------------------+---------------------------+--------+---------------------------------------------+------------+-----------------+
|merchant_abn|year_week|user_id|dollar_value      |order_id                            |consumer_id|fraud_probability_consumer|name_consumer     |address_consumer                |state_consumer|postcode_consumer|gender_consumer|name_merchant         |fraud_probability_merchant|order_datetime|order_month_year|SA4_CODE_2011|SA4_NAME_2011                    |unemployment_rate|consumer_weekly_transaction|merchant_weekly_transaction|is_fraud|merchant_category                       

## Feature Engineering: Ratio of Returning Customers (Yearly)
* Florists tend to have smaller order values, as indicated by the revenue bands
* Flowers are seasonal
* Thus the heuristic feature of choice would be the ratio of returning customers
* The data is aggregated on a yearly basis to account for the returning customers

In [4]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Step 1: Add a new column for 'order_year' from 'order_datetime'
florist_transactions = florist_transactions.withColumn('order_year', F.year(F.col('order_datetime')))

# Step 2: Calculate total transactions, dollar_value, fraud_probability for each merchant per year
total_transactions_df = florist_transactions.groupBy('merchant_abn', 'order_year').agg(
    F.count("order_id").alias("total_transactions"),
    F.avg("dollar_value").alias("average_dollar_value"),  # Include average dollar value
    F.avg("fraud_probability_merchant").alias("avg_fraud_probability_merchant"),  # Include average fraud prob merchant
    F.avg("fraud_probability_consumer").alias("avg_fraud_probability_consumer"),  # Include average fraud prob consumer,
    F.first("name_merchant").alias("merchant_name"),
    F.first("merchant_category").alias("merchant_category")
)

# Step 3: Calculate repeat customers per merchant per year
repeat_customers_df = florist_transactions.groupBy("merchant_abn", "user_id").agg(
    F.count("order_id").alias("customer_transactions")
).filter(F.col("customer_transactions") > 1) \
 .groupBy("merchant_abn").agg(F.count("user_id").alias("repeat_customers"))

# Step 4: Join the total transactions and repeat customers
yearly_repeat_ratio_df = total_transactions_df.join(repeat_customers_df, on="merchant_abn", how="left")

# Step 5: Fill null values in the 'repeat_customers' column with 0
yearly_repeat_ratio_df = yearly_repeat_ratio_df.withColumn(
    "repeat_customers", F.coalesce(F.col("repeat_customers"), F.lit(0))
)

# Step 6: Calculate the repeat customer ratio for each merchant per year
yearly_repeat_ratio_df = yearly_repeat_ratio_df.withColumn(
    "repeat_customer_ratio", F.col("repeat_customers") / F.col("total_transactions")
)

# Step 7: Create lag features for past repeat customer ratios (previous year)
window_spec = Window.partitionBy('merchant_abn').orderBy('order_year')

# Add lag feature for previous year's repeat customer ratio
yearly_repeat_ratio_df = yearly_repeat_ratio_df.withColumn(
    'previous_repeat_ratio', F.lag('repeat_customer_ratio', 1).over(window_spec)
)

# Fill null values in the lag column with 0 (for the first year)
yearly_repeat_ratio_df = yearly_repeat_ratio_df.fillna({'previous_repeat_ratio': 0})

# Step 8: Add other relevant features (average dollar value, fraud probability)
# These columns are already present in yearly_repeat_ratio_df

# Display the resulting DataFrame
yearly_repeat_ratio_df.show(5)




+------------+----------+------------------+--------------------+------------------------------+------------------------------+--------------------+--------------------+----------------+---------------------+---------------------+
|merchant_abn|order_year|total_transactions|average_dollar_value|avg_fraud_probability_merchant|avg_fraud_probability_consumer|       merchant_name|   merchant_category|repeat_customers|repeat_customer_ratio|previous_repeat_ratio|
+------------+----------+------------------+--------------------+------------------------------+------------------------------+--------------------+--------------------+----------------+---------------------+---------------------+
| 10385163239|      2021|                51|   324.9159337084658|             55.77091422960027|            13.350156006300637|      Sed Et Company|florists supplies...|               0|                  0.0|                  0.0|
| 10385163239|      2022|                 9|   245.5970374443057|           

                                                                                

## Random Forest Regression to Forecast Future Repeat Customer Ratio

In [5]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Step 1: Feature Engineering
# Select relevant features for modeling
feature_columns = ['total_transactions', 'average_dollar_value', 'avg_fraud_probability_merchant', 
                   'avg_fraud_probability_consumer', 'previous_repeat_ratio']

# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Scale the features
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Step 4: Random Forest Regressor Setup
rf = RandomForestRegressor(featuresCol='scaled_features', labelCol='repeat_customer_ratio')

# Step 5: Create a Pipeline
pipeline = Pipeline(stages=[assembler, scaler, rf])

In [6]:
# Step 6: Split Data into Training and Testing Sets
train_data, test_data = yearly_repeat_ratio_df.randomSplit([0.8, 0.2], seed=42)

# Step 7: Train the Random Forest Model
rf_model = pipeline.fit(train_data)

# Step 8: Evaluate Model on Test Data
predictions = rf_model.transform(test_data)

# Evaluate the model's performance using RMSE and R-squared
evaluator = RegressionEvaluator(labelCol="repeat_customer_ratio", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

# Show some predictions alongside actual values
predictions.select("merchant_abn", "order_year", "repeat_customer_ratio", "prediction").show(5)

24/09/25 17:20:54 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

Root Mean Squared Error (RMSE): 0.08831263051232179
R-squared (R²): 0.8995356691250309




+------------+----------+---------------------+-------------------+
|merchant_abn|order_year|repeat_customer_ratio|         prediction|
+------------+----------+---------------------+-------------------+
| 10545955006|      2021|  0.03546099290780142|0.05830292979827208|
| 12296390138|      2021|  0.16483516483516483|0.11306415908290886|
| 13108046922|      2021|                  0.0|0.03344361048917285|
| 13401263605|      2022|                  0.0|0.03559208367838205|
| 14796275705|      2022|                  0.0|0.11774011768624475|
+------------+----------+---------------------+-------------------+
only showing top 5 rows



                                                                                

In [7]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Step 1: Generate the future projection dataset
# Project future years based on the last available year for each merchant
future_repeat_ratio_df = yearly_repeat_ratio_df.groupBy('merchant_abn').agg(F.max('order_year').alias('latest_year'))

# Generate future year column
num_future_years = 1  # Number of future years to project
future_years = [(year,) for year in range(2023, 2023 + num_future_years)]  # Specify the future years you want to predict
future_years_df = spark.createDataFrame(future_years, ['order_year'])

# Cross-join merchants with future years
future_data = future_repeat_ratio_df.crossJoin(future_years_df)

# Step 2: Join the features from the latest available year for each merchant
latest_data = yearly_repeat_ratio_df.groupBy('merchant_abn').agg(
    F.max('order_year').alias('latest_year'),
    F.first('total_transactions').alias('total_transactions'),
    F.first('average_dollar_value').alias('average_dollar_value'),
    F.first('avg_fraud_probability_merchant').alias('avg_fraud_probability_merchant'),
    F.first('avg_fraud_probability_consumer').alias('avg_fraud_probability_consumer'),
    F.first('previous_repeat_ratio').alias('previous_repeat_ratio'),
    F.first('merchant_name').alias('merchant_name'),
    F.first("merchant_category").alias("merchant_category")
)

# Join the latest available features with the future data
future_data = future_data.join(latest_data, on='merchant_abn', how='left')

# Step 3: Apply the trained model to predict the future repeat customer ratio
future_data = rf_model.transform(future_data)

# Step 4: Show predicted repeat customer ratio for future years
future_data.select('merchant_abn', 'order_year', 'prediction').show(10)


                                                                                

+------------+----------+--------------------+
|merchant_abn|order_year|          prediction|
+------------+----------+--------------------+
| 10385163239|      2023| 0.04082535339665512|
| 10545955006|      2023| 0.05830292979827208|
| 11563852275|      2023| 0.14110118784184894|
| 12296390138|      2023| 0.11306415908290886|
| 13108046922|      2023| 0.03344361048917285|
| 13297955415|      2023|  0.0272997542241512|
| 13401263605|      2023| 0.05387128452719405|
| 13942467372|      2023|0.027111806569691393|
| 14602793938|      2023|0.030893350116233703|
| 14796275705|      2023| 0.01555700016899245|
+------------+----------+--------------------+
only showing top 10 rows



In [8]:
future_predictions = future_data.withColumnRenamed('prediction', 'projected_return_cust_ratio')
future_predictions = future_predictions.orderBy(F.col('projected_return_cust_ratio').desc())

In [9]:
future_predictions = future_predictions.select('merchant_abn', 'order_year', 'total_transactions', 'average_dollar_value',
                                               'avg_fraud_probability_merchant', 'avg_fraud_probability_consumer', 'projected_return_cust_ratio','merchant_name','merchant_category')

In [10]:
future_predictions.show(truncate=False, n=10)

                                                                                

+------------+----------+------------------+--------------------+------------------------------+------------------------------+---------------------------+------------------------------+---------------------------------------------+
|merchant_abn|order_year|total_transactions|average_dollar_value|avg_fraud_probability_merchant|avg_fraud_probability_consumer|projected_return_cust_ratio|merchant_name                 |merchant_category                            |
+------------+----------+------------------+--------------------+------------------------------+------------------------------+---------------------------+------------------------------+---------------------------------------------+
|33607911449 |2023      |5029              |29.77431590295652   |29.354820107111507            |12.817290438602107            |0.2670265860786621         |Nulla Semper LLC              |florists supplies, nursery stock, and flowers|
|21772962346 |2023      |23829             |134.64395525980592  |29.

## Export Ranking and Relevant Columns

In [11]:
future_predictions = future_predictions.select('merchant_abn', 'merchant_name', 'merchant_category', 'order_year', 'projected_return_cust_ratio')

In [12]:
future_predictions.write.parquet('../data/curated/florists_ranking')

                                                                                