In [1]:
# create modeling spark session
from pyspark.sql import SparkSession

# Session options, applied to the builder in the order listed.
_session_options = {
    'spark.sql.repl.eagerEval.enabled': True,
    'spark.sql.parquet.cacheMetadata': 'true',
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "8g",
    "spark.driver.memory": "8g",
}

_session_builder = SparkSession.builder.appName('Project 2')
for _option_name, _option_value in _session_options.items():
    _session_builder = _session_builder.config(_option_name, _option_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = _session_builder.getOrCreate()
import pandas as pd
from pyspark.sql import functions as F
import pathlib

In [2]:
# Make sure the output folder for the ranking tables exists
# (missing parent folders are created; an existing folder is not an error).
rank_dir = pathlib.Path("../data/curated/rank")
rank_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the curated final dataset that the rest of the notebook builds on.
final_dataset_path = '../data/curated/final_dataset'
data = spark.read.parquet(final_dataset_path)

Use annual transaction count to predict future revenue

In [4]:
# Count each merchant's transactions dated after 2021-08-27 and attach that
# count to the curated dataset.
clean_transaction = spark.read.parquet('../data/curated/clean_full_dataset')
annual_transaction_count = clean_transaction.where(
    F.col("order_datetime") > "2021-08-27"
)
annual_merchant_transaction = (
    annual_transaction_count
    .groupBy("merchant_abn")
    .count()
    .select("merchant_abn", "count")
)
# Left join: merchants with no transaction in the window are kept, with a
# null `count`.
full_data = data.join(
    annual_merchant_transaction,
    on="merchant_abn",
    how="left",
)


In [5]:
# Proportion of potential Afterpay users per merchant:
# 10% of the scaled spare-money score plus the gender-based Afterpay percentage.
spare_money_term = F.col("consumer_scaled_spare_money") * 0.1
gender_term = F.col("ap_percentage_by_gender").cast("float")
full_data = full_data.withColumn("ap_rate", spare_money_term + gender_term)

In [6]:
# Hand-assigned business segment for every distinct business_area. Entries are
# paired *positionally* with the `busi_area` list collected below.
busi_area_type = [
    "fashion", "fashion", "electronics", "recreation", 
    "recreation", "household", "recreation", "household",
    "recreation", "household", "electronics", "household",
    "recreation", "household", "household", "electronics",
    "fashion", "fashion", "recreation", "electronics",
    "recreation", "recreation", "household", "household", "household"]

# Distinct business_area values, in the order Spark happens to return them.
# NOTE(review): Spark does not guarantee the row order of a groupby result, so
# the positional pairing with `busi_area_type` can change between runs or
# environments. Verify the resulting pairs, or replace the two lists with an
# explicit {business_area: segment} mapping.
busi_area= full_data.groupby('business_area').count().select("business_area").toPandas()['business_area'].to_list()

# zip() silently truncates to the shorter list and the inner join below drops
# every merchant whose business_area has no label, so a mismatch would quietly
# lose merchants. Fail loudly instead.
if len(busi_area) != len(busi_area_type):
    raise ValueError(
        f"Found {len(busi_area)} distinct business_area values but "
        f"{len(busi_area_type)} segment labels; update busi_area_type."
    )

allocation_sdf = spark.createDataFrame(
    pd.DataFrame(
        list(zip(busi_area, busi_area_type)),
        columns=['business_area', 'business_segment'],
    )
)

# Drop any business_segment column left over from a previous run of this cell
# (a no-op when the column is absent) so re-running does not duplicate it.
full_data = full_data.drop("business_segment").join(allocation_sdf, on="business_area")

In [7]:
# Project next year's transaction count: scale the annual count by the
# annual turnover percentage, then truncate to an integer.
turnover_growth = 1 + F.col("annual_turnover_percentage") / 100
full_data = full_data.withColumn(
    "expected_transaction",
    (F.col("count") * turnover_growth).cast("int"),
)

In [8]:
# Some ap_rate values are negative; replace those with 0.001 and keep the rest.
current_ap_rate = full_data["ap_rate"]
floored_ap_rate = F.when(current_ap_rate < 0, 0.001).otherwise(current_ap_rate)
full_data = full_data.withColumn("ap_rate", floored_ap_rate)

In [9]:
# Expected Afterpay users = expected_transaction * 1.077 * ap_rate, truncated
# to a whole number.
# NOTE(review): the origin of the 1.077 multiplier is not shown in this
# notebook - confirm and document it.
expected_ap_users = (
    F.col("expected_transaction") * 1.077 * F.col("ap_rate")
).cast("int")

# Expected revenue = users * average order value * take rate (a percentage).
full_data = full_data.withColumn(
    "expected_revenue",
    expected_ap_users * F.col("avg_total_value") * F.col("take_rate") / 100,
)

In [10]:
# Show five rows to sanity-check the derived columns.
full_data_preview = full_data.limit(5)
full_data_preview

business_area,merchant_abn,business_area_type,total_transactions_count,avg_total_value,name,revenue_level,take_rate,consumer_scaled_spare_money,ap_percentage_by_gender,annual_turnover_percentage,fraud_rate,count,ap_rate,business_segment,expected_transaction,expected_revenue
antique shops - s...,37935728745,Retail trade,9,11317.019602391396,Laoreet Lectus Qu...,b,4.12,0.0966893187534178,0.0922777777777777,21.9,0.4444444444444444,6,0.1019467120505508,household,7,0.0
antique shops - s...,46916077029,Retail trade,71,13522.620077171276,Mauris Aliquam LLC,a,6.8,0.2560244766281576,0.0872183098591549,21.9,0.4788732394366197,53,0.1128207546214911,household,64,6436.767156733527
antique shops - s...,21319642626,Retail trade,8,22661.382077424903,At Risus Associates,b,3.29,0.4466573950774365,0.0816875,21.9,0.5,6,0.1263532420707434,household,7,0.0
antique shops - s...,11024352823,Retail trade,379,214.29456949819507,Aliquet Metus Urn...,c,2.62,0.2651272370833886,0.0859828496042216,21.9,0.0026385224274406,275,0.1124955753924323,household,335,224.5807088341084
antique shops - s...,83173175334,Retail trade,16,13574.87446695437,Enim Gravida Inc.,a,5.81,0.5519064947172286,0.07596875,21.9,0.5625,11,0.1311593992929089,household,13,788.700206530049


In [12]:
# Keep only the columns needed to rank merchants.
ranking_columns = [
    "merchant_abn",
    "name",
    "business_area",
    "expected_revenue",
    "avg_total_value",
    "fraud_rate",
    "ap_rate",
    "revenue_level",
    "business_segment",
    "expected_transaction",
]
ranking = full_data.select(*ranking_columns)

In [13]:
# Discount expected revenue by the merchant's fraud rate, then sort so the
# highest fraud-adjusted revenue comes first.
fraud_adjusted_revenue = F.col("expected_revenue") * (1 - F.col("fraud_rate"))
ranking = (
    ranking
    .withColumn("true_revenue", fraud_adjusted_revenue)
    .orderBy(F.col("true_revenue").desc())
)

In [26]:
# The intermediate inputs to true_revenue are no longer needed in the ranking.
helper_columns = ["expected_revenue", "fraud_rate", "ap_rate"]
ranking = ranking.drop(*helper_columns)
ranking

merchant_abn,name,business_area,avg_total_value,revenue_level,business_segment,expected_transaction,true_revenue
32361057556,Orci In Consequat...,"gift, card, novel...",109.98738471647025,a,recreation,65016,43766.74291931224
45629217853,Lacus Consulting,"gift, card, novel...",36.86309255250773,a,recreation,173438,42207.57778690396
96680767841,Ornare Limited,motor vehicle sup...,315.06861309524027,a,household,23763,39728.91708258423
21439773999,Mauris Non Institute,"cable, satellite,...",78.14540729233093,a,electronics,91406,39120.00430848647
89726005175,Est Nunc Consulting,tent and awning s...,41.23187459923756,a,household,163743,38512.82005309113
64403598239,Lobortis Ultrices...,music shops - mus...,78.11269321474416,a,recreation,85967,38210.088844639424
48534649627,Dignissim Maecena...,"opticians, optica...",141.65798620362992,a,fashion,48309,37992.51900873453
67978471888,Magna Malesuada C...,artist supply and...,648.8876206534682,a,recreation,9454,36923.28489682172
38700038932,Etiam Bibendum In...,tent and awning s...,1336.2436316547887,a,household,5411,36430.26875071515
86578477987,Leo In Consulting,"watch, clock, and...",34.98176268056113,a,fashion,173786,36308.45974929885


In [15]:
# Household segment: rank by fraud-adjusted revenue (highest first), save it
# as parquet, and display it.
in_household = F.col("business_segment") == "household"
household_rank = ranking.where(in_household).orderBy(F.desc("true_revenue"))
household_rank.write.parquet("../data/curated/rank/household_rank/", mode="overwrite")
household_rank

merchant_abn,name,business_area,avg_total_value,revenue_level,business_segment,expected_transaction,true_revenue
96680767841,Ornare Limited,motor vehicle sup...,315.06861309524027,a,household,23763,39728.91708258423
89726005175,Est Nunc Consulting,tent and awning s...,41.23187459923756,a,household,163743,38512.82005309113
38700038932,Etiam Bibendum In...,tent and awning s...,1336.2436316547887,a,household,5411,36430.26875071515
80551528183,Ac Ipsum LLC,tent and awning s...,1125.4718484638406,b,household,5919,31630.093244327345
76767266140,Phasellus At Limited,"furniture, home f...",214.48522155694192,b,household,31572,30628.084659145745
49549583265,Luctus Et Incorpo...,tent and awning s...,678.704234522669,a,household,7611,30100.626867241703
79827781481,Amet Risus Inc.,"furniture, home f...",2036.4814449572923,a,household,3442,28800.457005251534
49891706470,Non Vestibulum In...,tent and awning s...,28.980731305967694,a,household,187350,28426.296434780867
43186523025,Lorem Ipsum Sodal...,florists supplies...,44.99407448848733,b,household,152011,28308.97741075364
90568944804,Diam Eu Dolor LLC,tent and awning s...,898.5753673253311,b,household,8157,26953.984708165717


In [17]:
# Recreation segment: rank by fraud-adjusted revenue (highest first), save it
# as parquet, and display it.
in_recreation = F.col("business_segment") == "recreation"
recreation_rank = ranking.where(in_recreation).orderBy(F.desc("true_revenue"))
recreation_rank.write.parquet("../data/curated/rank/recreation_rank/", mode="overwrite")
recreation_rank

merchant_abn,name,business_area,avg_total_value,revenue_level,business_segment,expected_transaction,true_revenue
32361057556,Orci In Consequat...,"gift, card, novel...",109.98738471647025,a,recreation,65016,43766.74291931224
45629217853,Lacus Consulting,"gift, card, novel...",36.86309255250773,a,recreation,173438,42207.57778690396
64403598239,Lobortis Ultrices...,music shops - mus...,78.11269321474416,a,recreation,85967,38210.088844639424
67978471888,Magna Malesuada C...,artist supply and...,648.8876206534682,a,recreation,9454,36923.28489682172
94493496784,Dictum Phasellus ...,"gift, card, novel...",91.89829533112882,a,recreation,75281,36284.53364046986
40515428545,Elit Sed Consequa...,artist supply and...,596.5734154604642,a,recreation,9881,34353.536434727845
63123845164,Odio Phasellus In...,artist supply and...,751.2370990170103,a,recreation,8641,33735.956793983576
79417999332,Phasellus At Company,"gift, card, novel...",91.9414464701124,b,recreation,75447,33074.7818877211
72472909171,Nullam Consulting,digital goods: bo...,70.31518680234745,a,recreation,77204,32167.069700367047
27326652377,Tellus Aenean Cor...,music shops - mus...,1009.5896252964578,a,recreation,4910,30423.656221079516


In [18]:
# Electronics segment: rank by fraud-adjusted revenue (highest first), save it
# as parquet, and display it.
in_electronics = F.col("business_segment") == "electronics"
electronic_rank = ranking.where(in_electronics).orderBy(F.desc("true_revenue"))
electronic_rank.write.parquet("../data/curated/rank/electronic_rank/", mode="overwrite")
electronic_rank

merchant_abn,name,business_area,avg_total_value,revenue_level,business_segment,expected_transaction,true_revenue
21439773999,Mauris Non Institute,"cable, satellite,...",78.14540729233093,a,electronics,91406,39120.00430848647
35909341340,Arcu Sed Eu Incor...,computer programm...,250.7705228080561,b,electronics,26338,32279.682572125555
45433476494,Adipiscing Elit F...,"computers, comput...",450.93672527389566,a,electronics,12121,31161.95303567217
82368304209,Nec Incorporated,telecom,1867.954943974729,a,electronics,3720,31146.11606194541
94690988633,Eu Placerat LLC,"computers, comput...",202.0520362068834,a,electronics,23614,27964.158872441654
58454491168,Diam At Foundation,computer programm...,251.55068227498936,a,electronics,18454,24132.20391261745
80518954462,Neque Sed Dictum ...,"computers, comput...",300.9737813663696,b,electronics,22628,23198.08590220679
34096466752,Nullam Enim Ltd,"computers, comput...",500.5880908303476,b,electronics,14008,21165.01895563647
67400260923,Eleifend PC,computer programm...,149.812854422409,a,electronics,25908,20027.85742375301
77590625261,Sed Diam Foundation,computer programm...,301.96276795289305,b,electronics,17412,16560.265927804096


In [19]:
# Fashion segment: rank by fraud-adjusted revenue (highest first), save it
# as parquet, and display it.
in_fashion = F.col("business_segment") == "fashion"
fashion_rank = ranking.where(in_fashion).orderBy(F.desc("true_revenue"))
fashion_rank.write.parquet("../data/curated/rank/fashion_rank/", mode="overwrite")
fashion_rank

merchant_abn,name,business_area,avg_total_value,revenue_level,business_segment,expected_transaction,true_revenue
48534649627,Dignissim Maecena...,"opticians, optica...",141.65798620362992,a,fashion,48309,37992.51900873453
86578477987,Leo In Consulting,"watch, clock, and...",34.98176268056113,a,fashion,173786,36308.45974929885
49322182190,Gravida Mauris In...,"watch, clock, and...",151.2973045284807,a,fashion,33009,28798.587560997054
19492220327,Commodo Ipsum Ind...,"jewelry, watch, c...",9997.797797783242,b,fashion,649,20767.51858832556
11439466003,Blandit At LLC,shoe shops,163.07968880593438,a,fashion,22619,20088.496428337545
93558142492,Dolor Quisque Inc.,shoe shops,405.6866231203035,b,fashion,16544,18853.320486544595
95574756848,At Pede Inc.,"opticians, optica...",357.41601111620497,a,fashion,8266,17358.93264217874
99976658299,Sociosqu Corp.,shoe shops,149.92402158371087,a,fashion,17440,16660.979479032503
46804135891,Suspendisse Dui C...,"opticians, optica...",29.98490952653827,c,fashion,170658,13434.684057044791
81761494572,Nulla Facilisis I...,"watch, clock, and...",115.0319859686225,a,fashion,16826,13369.956102551712


Top 100 merchants

In [27]:
# `ranking` is already sorted by true_revenue (descending), so the first 100
# rows are the top 100 merchants overall.
top_n_overall = 100
top100 = ranking.limit(top_n_overall)
top100

merchant_abn,name,business_area,avg_total_value,revenue_level,business_segment,expected_transaction,true_revenue
32361057556,Orci In Consequat...,"gift, card, novel...",109.98738471647025,a,recreation,65016,43766.74291931224
45629217853,Lacus Consulting,"gift, card, novel...",36.86309255250773,a,recreation,173438,42207.57778690396
96680767841,Ornare Limited,motor vehicle sup...,315.06861309524027,a,household,23763,39728.91708258423
21439773999,Mauris Non Institute,"cable, satellite,...",78.14540729233093,a,electronics,91406,39120.00430848647
89726005175,Est Nunc Consulting,tent and awning s...,41.23187459923756,a,household,163743,38512.82005309113
64403598239,Lobortis Ultrices...,music shops - mus...,78.11269321474416,a,recreation,85967,38210.088844639424
48534649627,Dignissim Maecena...,"opticians, optica...",141.65798620362992,a,fashion,48309,37992.51900873453
67978471888,Magna Malesuada C...,artist supply and...,648.8876206534682,a,recreation,9454,36923.28489682172
38700038932,Etiam Bibendum In...,tent and awning s...,1336.2436316547887,a,household,5411,36430.26875071515
86578477987,Leo In Consulting,"watch, clock, and...",34.98176268056113,a,fashion,173786,36308.45974929885


In [24]:
# Take the first 10 rows of each per-segment ranking.
top10fashion, top10electric, top10recreation, top10household = (
    segment_rank.limit(10)
    for segment_rank in (fashion_rank, electronic_rank, recreation_rank, household_rank)
)

In [25]:
# Display the top-10 fashion merchants (bare last expression -> rich table output).
top10fashion

merchant_abn,name,business_area,avg_total_value,revenue_level,business_segment,expected_transaction,true_revenue
48534649627,Dignissim Maecena...,"opticians, optica...",141.65798620362992,a,fashion,48309,37992.51900873453
86578477987,Leo In Consulting,"watch, clock, and...",34.98176268056113,a,fashion,173786,36308.45974929885
49322182190,Gravida Mauris In...,"watch, clock, and...",151.2973045284807,a,fashion,33009,28798.587560997054
19492220327,Commodo Ipsum Ind...,"jewelry, watch, c...",9997.797797783242,b,fashion,649,20767.51858832556
11439466003,Blandit At LLC,shoe shops,163.07968880593438,a,fashion,22619,20088.496428337545
93558142492,Dolor Quisque Inc.,shoe shops,405.6866231203035,b,fashion,16544,18853.320486544595
95574756848,At Pede Inc.,"opticians, optica...",357.41601111620497,a,fashion,8266,17358.93264217874
99976658299,Sociosqu Corp.,shoe shops,149.92402158371087,a,fashion,17440,16660.979479032503
46804135891,Suspendisse Dui C...,"opticians, optica...",29.98490952653827,c,fashion,170658,13434.684057044791
81761494572,Nulla Facilisis I...,"watch, clock, and...",115.0319859686225,a,fashion,16826,13369.956102551712
