In [47]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import lit

# Build (or reuse) the Spark session shared by every cell in this notebook.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,  # show a DataFrame when it ends a cell
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("analysis")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

In [6]:
# Curated transaction-level table (one row per order, joined with merchant,
# consumer and postcode attributes); preview the first five rows.
final_df_path = "../data/curated/finaldf.parquet/"
sdf = spark.read.parquet(final_df_path)
sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2022-07-13,49891706470,22131,6731,58911,25.782245737474312,90498b80-984f-43c...,WA,Male,Non Vestibulum In...,tent and awning s...,5.8,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-08-19,96680767841,22131,6731,58911,415.9547984625,829843bc-c571-493...,WA,Male,Ornare Limited,motor vehicle sup...,5.91,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2022-07-13,43186523025,22131,6731,58911,33.082322196774484,c12d2520-12c8-40f...,WA,Male,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2021-08-19,66370248931,22131,6731,58911,82.54046571771035,cc8ac5c5-0dfc-4e1...,WA,Male,Morbi Non PC,"cable, satellite,...",3.15,b,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1
2022-05-06,89726005175,22131,6731,58911,58.46693553052902,b1f179f8-2c15-4f5...,WA,Male,Est Nunc Consulting,tent and awning s...,6.01,a,8374,1373,1185,2200,2600,1016,2907,39,48034,56306,0.1,0.1


## Initial Ranking System

The ranking system currently uses only the following features:
- future predictions for transactions per merchant per day
- future predictions for profit per merchant per day

In [3]:
# Predicted number of transactions per merchant per day; preview five rows.
future_trans_path = "../data/curated/futuretrans.parquet/"
future_trans_sdf = spark.read.parquet(future_trans_path)
future_trans_sdf.limit(5)

                                                                                

year,month,day,merchant_abn,prediction
2025,1,29,20734004951,1.8555858310626705
2025,4,17,20734004951,1.8555858310626705
2025,4,24,20734004951,1.8555858310626705
2025,4,4,20734004951,1.8555858310626705
2025,7,5,20734004951,2.090909090909091


In [5]:
# Predicted profit per merchant per day; preview five rows.
future_profit_path = "../data/curated/futureprofit.parquet/"
future_profit_sdf = spark.read.parquet(future_profit_path)
future_profit_sdf.limit(5)

year,month,day,merchant_abn,prediction
2025,2,23,86578477987,53.06774628167469
2025,1,10,86578477987,53.06774628167469
2025,8,9,86578477987,526.6029702970297
2025,7,3,86578477987,526.6029702970297
2025,10,7,86578477987,526.6029702970297


In [128]:
# One row per merchant that appears in the transaction forecast, each seeded
# with a single starting point, collected to pandas for the scoring cells.
merchants_pd = (
    future_trans_sdf
    .select('merchant_abn')
    .distinct()
    .withColumn('points', F.lit(1))
    .toPandas()
)

merchants_pd

Unnamed: 0,merchant_abn,points
0,29216160692,1
1,24526454463,1
2,28836033916,1
3,23617533061,1
4,24314208971,1
...,...,...
4013,81249812970,1
4014,81906511933,1
4015,82063890289,1
4016,82803045102,1


There are 4018 distinct merchants; we need to select the top 100.

Idea:
- every merchant starts with 1 point
- the more points a merchant has, the higher (better) its rank
- points are awarded according to the merchant's sorted position within each feature, scaled by the importance (weight) of that feature (initially there are 2 features with equal importance)

In [75]:
# Feature weights applied to a merchant's within-month position when points
# are awarded: transactions and profit currently count equally.
WEIGHT_TRANS, WEIGHT_PROFIT = 0.5, 0.5

***

Get the monthly number of transactions for each merchant

In [48]:
# Monthly predicted transaction count per merchant: sum the per-day
# predictions within each (year, month, merchant) and round to 2 d.p.
# `F.sum` is spelled out on purpose: a bare `sum` only resolves to Spark's
# aggregate because of the star import, and would otherwise be Python's
# builtin `sum`, which cannot take a column name.
ft_agg1 = future_trans_sdf.groupBy("year", "month", "merchant_abn").agg(
    F.round(F.sum("prediction"), 2).alias("total_transactions")
)

ft_agg1.limit(5)

year,month,merchant_abn,total_transactions
2025,12,20774839004,6.27
2025,8,20847737557,65.45
2025,8,20985347699,2878.52
2025,7,21166508693,80.61
2025,12,21166508693,71.66


Get the merchant with the highest number of total transactions for each month (`ft_agg2`):

In [49]:
# Window over all rows of the same calendar month. `year` is part of the key
# because ft_agg1 is grouped by year *and* month; partitioning on month alone
# would compare e.g. January of different forecast years with each other.
w = Window.partitionBy('year', 'month')

# Keep, for every month, the merchant(s) whose monthly total equals that
# month's maximum (ties are all retained).
ft_agg2 = (
    ft_agg1
    .withColumn('maxT', F.max('total_transactions').over(w))
    .where(F.col('total_transactions') == F.col('maxT'))
    .drop('maxT')
)

ft_agg2

year,month,merchant_abn,total_transactions
2025,1,24852446429,7069.85
2025,2,86578477987,5449.68
2025,3,24852446429,11902.78
2025,4,24852446429,12986.61
2025,5,24852446429,13649.64
2025,6,24852446429,12991.22
2025,7,24852446429,13183.31
2025,8,24852446429,13497.98
2025,9,24852446429,13234.83
2025,10,24852446429,13474.72


__[reference for above code](https://stackoverflow.com/questions/48829993/groupby-column-and-filter-rows-with-maximum-value-in-pyspark)__



For each month, sort the merchants in increasing order of their total transactions <br>
(the merchant with the lowest number of transactions for that month will be at the top)

In [105]:
# Sort by month, then by ascending monthly transactions, so that within each
# month a merchant's row position reflects its rank; collect to pandas.
ordered_ft_pd = (
    ft_agg1
    .orderBy(F.col("month").asc(), F.col("total_transactions").asc())
    .drop("year")
    .toPandas()
)

ordered_ft_pd

Unnamed: 0,month,merchant_abn,total_transactions
0,1,36300724008,1.19
1,1,64464998822,1.25
2,1,61333423388,1.26
3,1,61243536058,1.26
4,1,23322235350,1.34
...,...,...,...
45016,12,49839448838,6644.33
45017,12,49891706470,6644.33
45018,12,63290521567,6702.10
45019,12,24852446429,6966.12


Note: the cell below relies on pandas `groupby` preserving the original row order within each group.

In [133]:
# Award transaction points: for every month, a merchant earns
# WEIGHT_TRANS * (its 0-based position in that month's ascending ordering),
# so more predicted transactions -> larger position -> more points.
# Merchants with no row for a month earn 0 for that month.
#
# NOTE: this cell adds onto merchants_pd["points"]; re-running it without
# re-creating merchants_pd first will award the points a second time.

# Kept as notebook-level names because later cells read them.
merch_abns = list(merchants_pd["merchant_abn"])

MONTHS = 12

# Only months 1..MONTHS take part in the scoring.
ft_scored = ordered_ft_pd[ordered_ft_pd["month"].between(1, MONTHS)]

# Position of each row inside its month (groupby keeps the sorted row order),
# then keep only the first occurrence of a merchant within a month.
ft_scored = (
    ft_scored
    .assign(position=ft_scored.groupby("month").cumcount())
    .drop_duplicates(subset=["month", "merchant_abn"], keep="first")
)

# Total points per merchant across all months; a month that is absent from
# the data simply contributes nothing instead of raising.
trans_points = (
    np.round(WEIGHT_TRANS * ft_scored["position"], 3)
    .groupby(ft_scored["merchant_abn"])
    .sum()
)

merchants_pd["points"] = (
    merchants_pd["points"]
    + merchants_pd["merchant_abn"].map(trans_points).fillna(0)
)


Note: The points system rewards rank only; it does not account for the magnitude of the difference in the number of transactions between merchants.

TODO: work out whether the magnitude of the difference can be taken into account.

In [134]:
merchants_pd

Unnamed: 0,merchant_abn,points
0,29216160692,32195.0
1,24526454463,1355.0
2,28836033916,137.0
3,23617533061,32691.0
4,24314208971,2570.5
...,...,...
4013,81249812970,42801.0
4014,81906511933,8439.0
4015,82063890289,5421.5
4016,82803045102,7105.5


Should the points be scaled so they are in the range of 1-4018 before working on the next feature?

***

Get the monthly total profit for each merchant

In [50]:
# Monthly predicted profit per merchant: sum the per-day predictions within
# each (year, month, merchant) and round to 2 d.p.
# `F.sum` is spelled out on purpose: a bare `sum` only resolves to Spark's
# aggregate because of the star import, and would otherwise be Python's
# builtin `sum`, which cannot take a column name.
fp_agg1 = future_profit_sdf.groupBy("year", "month", "merchant_abn").agg(
    F.round(F.sum("prediction"), 2).alias("total_profit")
)

fp_agg1.limit(5)

year,month,merchant_abn,total_profit
2025,11,87084550311,6364.34
2025,2,87340243269,1326.69
2025,2,87630626808,1538.96
2025,6,87769349119,491.54
2025,7,87998844202,2539.61


Get the merchant with the highest profit for each month (`fp_agg2`):

In [51]:
# Window over all rows of the same calendar month. `year` is part of the key
# because fp_agg1 is grouped by year *and* month; partitioning on month alone
# would compare e.g. January of different forecast years with each other.
w = Window.partitionBy('year', 'month')

# Keep, for every month, the merchant(s) whose monthly profit equals that
# month's maximum (ties are all retained).
fp_agg2 = (
    fp_agg1
    .withColumn('maxP', F.max('total_profit').over(w))
    .where(F.col('total_profit') == F.col('maxP'))
    .drop('maxP')
)

fp_agg2

year,month,merchant_abn,total_profit
2025,1,32361057556,15872.74
2025,2,21439773999,14585.16
2025,3,21439773999,44487.39
2025,4,21439773999,38037.4
2025,5,79827781481,58545.5
2025,6,79827781481,56656.94
2025,7,79827781481,58545.5
2025,8,79827781481,58545.5
2025,9,79827781481,55712.65
2025,10,79827781481,53824.09


For each month, sort the merchants in increasing order of their total profit <br>
(the merchant with the lowest total profit for that month will be at the top)

In [136]:
# Sort by month, then by ascending monthly profit, so that within each month
# a merchant's row position reflects its rank; collect to pandas.
ordered_fp_pd = (
    fp_agg1
    .orderBy(F.col("month").asc(), F.col("total_profit").asc())
    .drop("year")
    .toPandas()
)

ordered_fp_pd

Unnamed: 0,month,merchant_abn,total_profit
0,1,26945291705,9.74
1,1,34339412732,10.42
2,1,17649195875,11.07
3,1,13067850740,11.50
4,1,19458945909,13.04
...,...,...,...
45016,12,82368304209,20173.09
45017,12,72472909171,23471.62
45018,12,98166254020,25341.71
45019,12,21439773999,25724.52


In [137]:
# Award profit points: for every month, a merchant earns
# WEIGHT_PROFIT * (its 0-based position in that month's ascending ordering),
# so more predicted profit -> larger position -> more points.
# Merchants with no row for a month earn 0 for that month.
#
# NOTE: this cell adds onto merchants_pd["points"]; re-running it on its own
# will award the profit points a second time.

# Only months 1..MONTHS take part in the scoring.
fp_scored = ordered_fp_pd[ordered_fp_pd["month"].between(1, MONTHS)]

# Position of each row inside its month (groupby keeps the sorted row order),
# then keep only the first occurrence of a merchant within a month.
fp_scored = (
    fp_scored
    .assign(position=fp_scored.groupby("month").cumcount())
    .drop_duplicates(subset=["month", "merchant_abn"], keep="first")
)

# Total points per merchant across all months; a month that is absent from
# the data simply contributes nothing instead of raising.
profit_points = (
    np.round(WEIGHT_PROFIT * fp_scored["position"], 3)
    .groupby(fp_scored["merchant_abn"])
    .sum()
)

merchants_pd["points"] = (
    merchants_pd["points"]
    + merchants_pd["merchant_abn"].map(profit_points).fillna(0)
)

In [138]:
merchants_pd

Unnamed: 0,merchant_abn,points
0,29216160692,45918.5
1,24526454463,1512.0
2,28836033916,391.0
3,23617533061,44161.0
4,24314208971,4532.5
...,...,...
4013,81249812970,59836.0
4014,81906511933,12266.0
4015,82063890289,7133.5
4016,82803045102,9348.5


Get the top merchants

In [142]:
# Hand the accumulated scores back to Spark, highest-scoring merchant first.
rankings = (
    spark.createDataFrame(merchants_pd)
    .orderBy(F.col("points").desc())
)
rankings

merchant_abn,points
21439773999,67893.5
49891706470,67812.0
72472909171,67698.5
86578477987,67640.0
79417999332,67420.0
86591172660,67215.5
76819856970,66933.0
76767266140,66890.5
22033359776,66836.5
77590625261,66571.5


## Segmentation