In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
from owslib.wfs import WebFeatureService
from dotenv import load_dotenv
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import lit

# Build (or reuse) the Spark session for this notebook.
# Options are applied to the builder in the order they are listed here.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "2g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("analysis")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

22/10/05 21:36:22 WARN Utils: Your hostname, DESKTOP-LNDD2A2 resolves to a loopback address: 127.0.1.1; using 172.28.37.254 instead (on interface eth0)
22/10/05 21:36:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 21:36:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Read the curated final dataset from parquet and preview its first 5 rows.
sdf = spark.read.parquet("../data/curated/finaldf.parquet/")
sdf.limit(5)

                                                                                

order_datetime,merchant_abn,user_id,postcode,consumer_id,dollar_value,order_id,state,gender,company_name,tags,take_rate,revenue_band,total_pop,under10_pop,adolsc_pop,yng_adult_pop,mid_age_pop,old_pop,num_earners,median_age,median_income,mean_income,consumer_fraud_%,merchant_fraud_%
2021-08-21,80682333501,3146,5651,604753,0.3672339667473312,2a59c978-f760-42d...,SA,Male,Orci Corp.,florists supplies...,4.88,b,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-19,99478391356,3146,5651,604753,3035.1695642706595,82e100bc-25c2-4e3...,SA,Male,Orci Quis Foundation,"equipment, tool, ...",1.52,c,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-19,86578477987,3146,5651,604753,61.05946896765003,9e3c8e62-9e8e-4e8...,SA,Male,Leo In Consulting,"watch, clock, and...",6.43,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-14,32361057556,3146,5651,604753,155.3456409871304,e4ff9499-e96d-4e6...,SA,Male,Orci In Consequat...,"gift, card, novel...",6.61,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1
2021-08-16,20445424481,3146,5651,604753,57.66971365811276,4a36f2ed-7bcc-43d...,SA,Male,Amet Industries,digital goods: bo...,6.29,a,2172,275,278,304,692,623,1308,47,37685,43540,0.1,0.1


## Initial Ranking System

Ranking System uses the following features:
- future predictions for transactions per merchant per month
- future predictions for profit per merchant per month
- future predictions for customers within revenue range \\$50k-\\$75k per month

In [3]:
# Predicted total transactions per merchant per month; preview the first 5 rows.
future_trans_sdf = spark.read.parquet("../data/curated/futuretrans.parquet/")
future_trans_sdf.limit(5)

year,month,merchant_abn,total_transactions
2023,4,51026003250,31.04763929559019
2023,4,59248738261,33.48164050911184
2023,4,22298599094,37.40681590093664
2023,4,83004629651,32.18754728743305
2023,4,10385163239,30.385875300308964


In [4]:
# Predicted total profit per merchant per month; preview the first 5 rows.
future_profit_sdf = spark.read.parquet("../data/curated/futureprofit.parquet/")
future_profit_sdf.limit(5)

year,month,merchant_abn,total_profit
2023,4,51026003250,330.0064557418861
2023,4,59248738261,926.925607791602
2023,4,22298599094,268.7838578500479
2023,4,83004629651,244.74317960336447
2023,4,10385163239,323.1332842674351


In [5]:
# Predicted number of "middle" customers per merchant per month; preview the first 5 rows.
future_mcustomers_sdf = spark.read.parquet("../data/curated/futuremcustomers.parquet/")
future_mcustomers_sdf.limit(5)

                                                                                

year,month,merchant_abn,total_middle_customers
2023,7,36066395091,64.94303414454085
2023,7,87998844202,79.45028034832727
2023,7,66842618444,75.74034755604113
2023,7,46145508777,13.899866466213558
2023,7,80508375382,83.68346812191125


In [6]:
# Scoreboard: one row per distinct merchant ABN in the transaction forecast,
# each starting with 1 point. Collected to pandas because the scoring cells
# below update the "points" column in place.
merchants_pd = (
    future_trans_sdf
    .select('merchant_abn')
    .distinct()
    .withColumn('points', F.lit(1))
    .toPandas()
)

merchants_pd

                                                                                

Unnamed: 0,merchant_abn,points
0,73256306726,1
1,48214071373,1
2,38700038932,1
3,73841664453,1
4,83412691377,1
...,...,...
4013,87788702467,1
4014,81906511933,1
4015,66499078575,1
4016,26465868807,1


Have 4018 distinct merchants, need to select top 100 merchants

Idea:
- all merchants have initial points 1
- the more the number of points, the higher/better the rank will be
- points are awarded based on each merchant's sorted position within a feature and on the importance (weight) of that feature (currently 3 features with equal importance)

In [7]:
# Per-feature multipliers applied to a merchant's rank position when awarding points.
# All three features are weighted equally for now; split this into separate
# assignments to tune them individually.
WEIGHT_TRANS = WEIGHT_PROFIT = WEIGHT_MCUSTOMERS = 0.33

***

Get the merchant with the highest number of total transactions for each month (`ft_agg2`):

In [8]:
# One window per month, spanning every merchant's forecast for that month.
w = Window.partitionBy('month')

# Attach the monthly maximum to every row, keep only the row(s) that reach it
# (ties are all kept), then drop the helper column again.
ft_agg2 = (
    future_trans_sdf
    .withColumn('maxT', F.max(F.col('total_transactions')).over(w))
    .filter(F.col('total_transactions') == F.col('maxT'))
    .drop('maxT')
)

ft_agg2

                                                                                

year,month,merchant_abn,total_transactions
2023,1,24852446429,12706.867430622047
2023,2,24852446429,12725.976797500649
2023,3,24852446429,12601.87947537084
2023,4,24852446429,12624.618672374758
2023,5,24852446429,12557.475159081536
2023,6,24852446429,12576.861500222918
2023,7,24852446429,12540.356167994909
2023,8,24852446429,12555.113743323984
2023,9,24852446429,12535.352343858927
2023,10,24852446429,12546.09122781908


__[reference for above code](https://stackoverflow.com/questions/48829993/groupby-column-and-filter-rows-with-maximum-value-in-pyspark)__



For each month, order the merchants by increasing order of their total transactions <br>
(merchant with lowest number of transactions for that month will be at the top)

In [10]:
# Sort by month, then by ascending forecast transactions, so that within each
# month a merchant's row position is its rank (lowest first). "year" is not
# needed for scoring and is dropped before collecting to pandas.
ordered_ft_pd = (
    future_trans_sdf
    .orderBy(F.col("month").asc(), F.col("total_transactions").asc())
    .drop("year")
    .toPandas()
)

ordered_ft_pd

Unnamed: 0,month,merchant_abn,total_transactions
0,1,62346852889,21.097950
1,1,35235320738,21.149586
2,1,86791380020,21.154814
3,1,34455283435,21.199792
4,1,98082781436,21.204058
...,...,...,...
48211,12,46804135891,10163.244846
48212,12,49891706470,10724.271327
48213,12,64203420245,11311.885569
48214,12,86578477987,11805.480411


Note for below that `groupby` preserves ordering within each group

In [11]:
# Award points for forecast transactions.
#
# For every month, a merchant receives WEIGHT_TRANS * (its 0-based position in
# that month's ascending total_transactions ordering), rounded to 3 decimals.
# Merchants with no forecast for a month receive 0 points for that month.
#
# NOTE: this accumulates into merchants_pd["points"] in place, so re-running
# the cell adds the points a second time.

# Kept as a plain list because the later scoring cells read it.
merch_abns = list(merchants_pd["merchant_abn"])

MONTHS = 12

# Group once; the grouping does not change between iterations.
ft_by_month = ordered_ft_pd.groupby("month")

for month in range(1, MONTHS + 1):
    # ABNs forecast for this month, still in ascending total_transactions order
    # (groupby preserves row order within each group).
    month_abns = ft_by_month.get_group(month)["merchant_abn"]

    # ABN -> 0-based rank lookup, built once per month instead of scanning the
    # month's rows separately for every merchant.
    rank_by_abn = pd.Series(np.arange(len(month_abns)), index=month_abns.to_numpy())
    # Series.map needs a unique lookup index; keep the first (lowest) position
    # for any repeated ABN.
    rank_by_abn = rank_by_abn[~rank_by_abn.index.duplicated(keep="first")]

    # Merchants missing from this month map to NaN, which becomes 0 points.
    month_ranks = merchants_pd["merchant_abn"].map(rank_by_abn).fillna(0)

    merchants_pd["points"] = merchants_pd["points"] + np.round(WEIGHT_TRANS * month_ranks, 3)


Note: The rewarding points system does not account for the magnitude of the difference in number of transactions

TODO: investigate whether the points system can be changed to account for the magnitude of these differences

In [12]:
# Running points per merchant after adding the transaction-rank points.
merchants_pd

Unnamed: 0,merchant_abn,points
0,73256306726,13892.35
1,48214071373,8938.39
2,38700038932,14276.47
3,73841664453,10183.81
4,83412691377,15146.68
...,...,...
4013,87788702467,4939.45
4014,81906511933,3316.18
4015,66499078575,2503.06
4016,26465868807,2718.22


Should the points be scaled so they are in the range of 1-4018 before working on the next feature?

***

Get the merchant with the highest profit for each month (`fp_agg2`):

In [13]:
# One window per month, spanning every merchant's forecast for that month.
w = Window.partitionBy('month')

# Attach the monthly maximum profit to every row, keep only the row(s) that
# reach it (ties are all kept), then drop the helper column again.
fp_agg2 = (
    future_profit_sdf
    .withColumn('maxP', F.max(F.col('total_profit')).over(w))
    .filter(F.col('total_profit') == F.col('maxP'))
    .drop('maxP')
)

fp_agg2

year,month,merchant_abn,total_profit
2023,1,32361057556,27740.570878728235
2023,2,32361057556,27878.7679430336
2023,3,32361057556,27552.75259841377
2023,4,32361057556,27653.557096675115
2023,5,32361057556,27479.095309639102
2023,6,32361057556,27551.766701347373
2023,7,32361057556,27461.515288951385
2023,8,32361057556,27514.536693437625
2023,9,32361057556,27471.28095464556
2023,10,32361057556,27511.178900430907


For each month, order the merchants by increasing order of their total profit <br>
(merchant with lowest total profit for that month at the top)

In [14]:
# Sort by month, then by ascending forecast profit, so that within each month
# a merchant's row position is its rank (lowest first). "year" is not needed
# for scoring and is dropped before collecting to pandas.
ordered_fp_pd = (
    future_profit_sdf
    .orderBy(F.col("month").asc(), F.col("total_profit").asc())
    .drop("year")
    .toPandas()
)

ordered_fp_pd

Unnamed: 0,month,merchant_abn,total_profit
0,1,16156289887,169.090896
1,1,86791380020,177.765852
2,1,33068332703,179.699605
3,1,10441711491,179.786133
4,1,21350600012,179.972700
...,...,...,...
48211,12,96680767841,25475.481580
48212,12,45629217853,25860.368887
48213,12,86578477987,27096.311961
48214,12,48534649627,27268.520005


In [15]:
# Award points for forecast profit.
#
# For every month, a merchant receives WEIGHT_PROFIT * (its 0-based position in
# that month's ascending total_profit ordering), rounded to 3 decimals.
# Merchants with no forecast for a month receive 0 points for that month.
#
# NOTE: this accumulates into merchants_pd["points"] in place, so re-running
# the cell adds the points a second time.

# Group once; the grouping does not change between iterations.
fp_by_month = ordered_fp_pd.groupby("month")

for month in range(1, MONTHS + 1):
    # ABNs forecast for this month, still in ascending total_profit order
    # (groupby preserves row order within each group).
    month_abns = fp_by_month.get_group(month)["merchant_abn"]

    # ABN -> 0-based rank lookup, built once per month instead of scanning the
    # month's rows separately for every merchant.
    rank_by_abn = pd.Series(np.arange(len(month_abns)), index=month_abns.to_numpy())
    # Series.map needs a unique lookup index; keep the first (lowest) position
    # for any repeated ABN.
    rank_by_abn = rank_by_abn[~rank_by_abn.index.duplicated(keep="first")]

    # Merchants missing from this month map to NaN, which becomes 0 points.
    month_ranks = merchants_pd["merchant_abn"].map(rank_by_abn).fillna(0)

    merchants_pd["points"] = merchants_pd["points"] + np.round(WEIGHT_PROFIT * month_ranks, 3)

In [16]:
# Running points per merchant after adding the profit-rank points.
merchants_pd

Unnamed: 0,merchant_abn,points
0,73256306726,28821.88
1,48214071373,17701.87
2,38700038932,30151.45
3,73841664453,18054.31
4,83412691377,26337.31
...,...,...
4013,87788702467,7713.43
4014,81906511933,9869.65
4015,66499078575,5625.19
4016,26465868807,8099.86


***

Get the merchant with the highest number of middle customers for each month (`fmc_agg2`):

In [18]:
# One window per month, spanning every merchant's forecast for that month.
w = Window.partitionBy('month')

# Attach the monthly maximum middle-customer count to every row, keep only the
# row(s) that reach it (ties are all kept), then drop the helper column again.
fmc_agg2 = (
    future_mcustomers_sdf
    .withColumn('maxMC', F.max(F.col('total_middle_customers')).over(w))
    .filter(F.col('total_middle_customers') == F.col('maxMC'))
    .drop('maxMC')
)

fmc_agg2

year,month,merchant_abn,total_middle_customers
2023,1,24852446429,2796.8587999146635
2023,2,24852446429,2794.892202393302
2023,3,24852446429,2781.9611881292603
2023,4,24852446429,2781.828716128044
2023,5,24852446429,2775.997045578518
2023,6,24852446429,2776.425821608259
2023,7,24852446429,2773.9088483222054
2023,8,24852446429,2774.460895292936
2023,9,24852446429,2773.4974495653432
2023,10,24852446429,2774.0415413334777


For each month, order the merchants by increasing order of their total middle customers <br>
(merchant with the lowest number of middle customers for that month at the top)

In [19]:
# Sort by month, then by ascending forecast middle customers, so that within
# each month a merchant's row position is its rank (lowest first). "year" is
# not needed for scoring and is dropped before collecting to pandas.
ordered_fmc_pd = (
    future_mcustomers_sdf
    .orderBy(F.col("month").asc(), F.col("total_middle_customers").asc())
    .drop("year")
    .toPandas()
)

ordered_fmc_pd

Unnamed: 0,month,merchant_abn,total_middle_customers
0,1,28836033916,4.850240
1,1,14626521979,4.850240
2,1,37706925794,4.850240
3,1,95594775419,4.850240
4,1,37670671283,4.850240
...,...,...,...
48211,12,46804135891,2379.631110
48212,12,49891706470,2465.908543
48213,12,64203420245,2576.013135
48214,12,86578477987,2670.840636


In [20]:
# Award points for forecast middle customers.
#
# For every month, a merchant receives WEIGHT_MCUSTOMERS * (its 0-based position
# in that month's ascending total_middle_customers ordering), rounded to 3
# decimals. Merchants with no forecast for a month receive 0 points for it.
#
# This feature is scaled by WEIGHT_MCUSTOMERS, not WEIGHT_PROFIT, so that the
# weights can be tuned independently.
#
# NOTE: this accumulates into merchants_pd["points"] in place, so re-running
# the cell adds the points a second time.

# Group once; the grouping does not change between iterations.
fmc_by_month = ordered_fmc_pd.groupby("month")

for month in range(1, MONTHS + 1):
    # ABNs forecast for this month, still in ascending total_middle_customers
    # order (groupby preserves row order within each group).
    month_abns = fmc_by_month.get_group(month)["merchant_abn"]

    # ABN -> 0-based rank lookup, built once per month instead of scanning the
    # month's rows separately for every merchant.
    rank_by_abn = pd.Series(np.arange(len(month_abns)), index=month_abns.to_numpy())
    # Series.map needs a unique lookup index; keep the first (lowest) position
    # for any repeated ABN.
    rank_by_abn = rank_by_abn[~rank_by_abn.index.duplicated(keep="first")]

    # Merchants missing from this month map to NaN, which becomes 0 points.
    month_ranks = merchants_pd["merchant_abn"].map(rank_by_abn).fillna(0)

    merchants_pd["points"] = merchants_pd["points"] + np.round(WEIGHT_MCUSTOMERS * month_ranks, 3)

In [21]:
# Final points per merchant after adding the middle-customer-rank points.
merchants_pd

Unnamed: 0,merchant_abn,points
0,73256306726,42630.40
1,48214071373,26151.52
2,38700038932,44411.41
3,73841664453,28182.01
4,83412691377,41500.81
...,...,...
4013,87788702467,12622.51
4014,81906511933,13085.83
4015,66499078575,8791.21
4016,26465868807,10997.59


***

Get the top merchants

In [22]:
# Move the scoreboard back into Spark and order it best-first (most points at the top).
points_sdf = spark.createDataFrame(merchants_pd)
rankings = points_sdf.orderBy(F.col("points").desc())
rankings

                                                                                

merchant_abn,points
86578477987,47707.12
45629217853,47671.47999999997
89726005175,47640.45999999997
49891706470,47620.00000000001
21439773999,47616.04000000002
64403598239,47603.83000000002
32361057556,47588.32000000001
43186523025,47579.74000000001
94493496784,47564.560000000056
72472909171,47558.95


## Segmentation