In [1]:
# create modeling spark session
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName('Project 2')
    .config('spark.sql.repl.eagerEval.enabled', True) 
    .config('spark.sql.parquet.cacheMetadata', 'true')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [2]:
from pyspark.sql import functions as F

In [3]:
data = spark.read.parquet('../data/curated/final_dataset')
data.limit(5)

business_area_type,merchant_abn,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,consumer_scaled_spare_money,ap_percentage_by_gender,annual_turnover_percentage
Retail trade,11788487195,184,1087.1773317511631,Id Magna LLP,music shops - mus...,b,3.51,0.0802806368019285,0.0871576086956521,21.9
Retail trade,59962058395,377,306.7863223809038,Pellentesque Tinc...,florists supplies...,d,1.31,0.2537913711493971,0.0833965517241379,21.9
Retail trade,33765505000,29,829.8756270760261,Ut Nulla Associates,tent and awning s...,c,1.58,-0.03067390145974...,0.0833965517241379,21.9
Manufacturing,72167534195,778,93.60350729533076,Quisque Libero Co...,"opticians, optica...",b,4.11,0.1763360446096375,0.0858136246786632,17.0
Retail trade,87285933135,324,97.10476271896748,Proin Vel Associates,shoe shops,c,1.67,0.2756853056509352,0.0870061728395061,21.9


If we use annual transaction count to predict revenue, then run the following

In [4]:
clean_transaction = spark.read.parquet('../data/curated/clean_full_dataset')

In [5]:
annual_transaction_count = clean_transaction.filter(F.col("order_datetime")>"2021-08-27")

In [6]:
annual_merchant_transaction = annual_transaction_count.groupby("merchant_abn").count()

In [7]:
full_data = data.join(annual_merchant_transaction, on="merchant_abn", how="left")

In [8]:
full_data = full_data.withColumn("ap_rate", (F.col("consumer_scaled_spare_money")*0.1+F.col("ap_percentage_by_gender").cast("float")))

In [9]:
# cal
full_data = full_data.withColumn("expected_transaction", (F.col("total_transactions_count")*
                                 (1+F.col("annual_turnover_percentage")/100)).cast("int"))

In [22]:
full_data = full_data.withColumn("expected_revenue", \
                        ((F.col("expected_transaction")*F.col("ap_percentage_by_gender")).cast("int")\
                        *F.col("avg_total_value")+(F.col("expected_transaction")*0.077*\
                            F.col("avg_total_value")))*F.col("take_rate")/100)


In [10]:
full_data = full_data.withColumn("expected_revenue", \
                        ((F.col("expected_transaction")*1.077*F.col("ap_rate")).cast("int")\
                        *F.col("avg_total_value")*F.col("take_rate")/100))



In [12]:
full_data

merchant_abn,business_area_type,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,consumer_scaled_spare_money,ap_percentage_by_gender,annual_turnover_percentage,count,ap_rate,expected_transaction,expected_revenue
48214071373,Retail trade,469,300.94198389965146,Ridiculus LLC,motor vehicle sup...,b,4.02,0.12917558022955608,0.0864754797441364,21.9,341,0.09939303464553709,571,737.9699329187252
73256306726,Other services,4749,284.38398242669064,Id LLP,health and beauty...,b,4.81,0.03254363266004481,0.0850504316698252,2.4,3371,0.08830479714013473,4862,6319.637734282404
83412691377,Other services,12748,34.92686555664642,Suspendisse Sagit...,"watch, clock, and...",c,2.94,-0.00802709826851...,0.0850262786319422,2.4,9103,0.0842235692649837,13053,1215.7902192806391
34440496342,Manufacturing,192,89.5003676383182,Mauris Nulla Inte...,"opticians, optica...",c,2.85,0.2587477349990235,0.0897890625,17.0,135,0.11566383623832092,224,68.87053289768585
38700038932,Retail trade,6383,1337.4804473531829,Etiam Bibendum In...,tent and awning s...,a,6.31,-0.06052124811389448,0.0852085226382578,21.9,4537,0.07915639548184633,7780,55953.895759154606
73841664453,Retail trade,852,86.86264397291626,Lacinia At LLP,digital goods: bo...,a,5.55,0.13532990735039052,0.0846050469483568,21.9,591,0.09813803635166579,1038,525.4755647141569
78916025936,Retail trade,64,319.68115127325086,Urna Nec Corporation,florists supplies...,e,0.37,0.21464664863316452,0.0864531249999999,21.9,48,0.10791778974410716,78,10.645382337399257
60654402457,Retail trade,171,86.2293852509412,Lacus Quisque Imp...,digital goods: bo...,b,3.52,-4.008149999537232,0.0794356725146198,21.9,119,-0.32137933106801536,208,-215.50447961915225
19839532017,Retail trade,674,157.0,Pellentesque Habi...,"cable, satellite,...",b,4.94,-0.8502169050815052,0.0852284866468842,21.9,484,2.067973410850160...,821,0.0
38986645707,Retail trade,38,943.1286654741962,Lectus Justo LLC,bicycle shops - s...,c,1.72,0.27008298844572065,0.0895131578947368,21.9,30,0.11652145913290596,46,81.10906523078087


In [17]:
ranking = full_data.select("merchant_abn", "name", "business_area", "expected_revenue")

In [18]:
ranking.orderBy(F.col("expected_revenue").desc())

merchant_abn,name,business_area,expected_revenue
32361057556,Orci In Consequat...,"gift, card, novel...",62224.66569754494
45629217853,Lacus Consulting,"gift, card, novel...",60079.36627286218
96680767841,Ornare Limited,motor vehicle sup...,56982.14902073573
21439773999,Mauris Non Institute,"cable, satellite,...",56548.880127988654
38700038932,Etiam Bibendum In...,tent and awning s...,55953.895759154606
48534649627,Dignissim Maecena...,"opticians, optica...",55471.7824868209
89726005175,Est Nunc Consulting,tent and awning s...,55366.675559525
19492220327,Commodo Ipsum Ind...,"jewelry, watch, c...",55081.61016991129
64403598239,Lobortis Ultrices...,music shops - mus...,54593.53422874367
67978471888,Magna Malesuada C...,artist supply and...,53131.83946357304


In [13]:
full_data.orderBy(F.col("expected_revenue").desc())

merchant_abn,business_area_type,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,consumer_scaled_spare_money,ap_percentage_by_gender,annual_turnover_percentage,count,ap_rate,expected_transaction,expected_revenue
32361057556,Retail trade,76349,109.94763424911493,Orci In Consequat...,"gift, card, novel...",a,6.61,0.001298628265308...,0.0852938676341537,21.9,54388,0.0854237295205043,93069,62224.66569754494
45629217853,Retail trade,203150,36.84341693724839,Lacus Consulting,"gift, card, novel...",a,6.98,0.023380752514037626,0.0852564804331774,21.9,145080,0.0875945549319419,247639,60079.36627286218
96680767841,Retail trade,27728,314.9836517640228,Ornare Limited,motor vehicle sup...,a,5.91,-0.01106999040695...,0.0852151074725908,21.9,19881,0.0841081075657882,33800,56982.14902073573
21439773999,Retail trade,107366,78.1315484003629,Mauris Non Institute,"cable, satellite,...",a,6.1,-0.01118632446995...,0.0852954659761935,21.9,76429,0.0841768361218063,130879,56548.880127988654
38700038932,Retail trade,6383,1337.4804473531829,Etiam Bibendum In...,tent and awning s...,a,6.31,-0.06052124811389448,0.0852085226382578,21.9,4537,0.0791563954818463,7780,55953.895759154606
48534649627,Manufacturing,59045,141.7645360505851,Dignissim Maecena...,"opticians, optica...",a,6.64,-0.05905459218211...,0.0851131001778304,17.0,42080,0.0792076414893195,69082,55471.7824868209
89726005175,Retail trade,192212,41.23366403138043,Est Nunc Consulting,tent and awning s...,a,6.01,0.03259640249317893,0.0852783254947662,21.9,137037,0.0885379650321662,234306,55366.675559525
19492220327,Retail trade,738,9975.661071050292,Commodo Ipsum Ind...,"jewelry, watch, c...",b,4.93,0.3041444891863695,0.0852933604336043,21.9,540,0.1157078089731299,899,55081.61016991129
64403598239,Retail trade,101021,78.13516813680323,Lobortis Ultrices...,music shops - mus...,a,6.31,-0.01783674053447056,0.0852796002811296,21.9,71857,0.0834959247786833,123144,54593.53422874367
67978471888,Retail trade,11095,648.30978524122,Magna Malesuada C...,artist supply and...,a,5.56,0.15783094338825146,0.0854807570977918,21.9,7929,0.1012638513964919,13524,53131.83946357304


In [45]:
busi_area_type = []