In [1]:
# create modeling spark session
from pyspark.sql import SparkSession

# Session-level Spark settings, applied to the builder before the session exists.
_spark_settings = [
    ('spark.sql.repl.eagerEval.enabled', True),  # eager evaluation of DataFrames in the notebook
    ('spark.sql.parquet.cacheMetadata', 'true'),
    ("spark.sql.session.timeZone", "Etc/UTC"),   # session time zone fixed to UTC
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "8g"),
]

_session_builder = SparkSession.builder.appName('Project 2')
for _setting_name, _setting_value in _spark_settings:
    _session_builder = _session_builder.config(_setting_name, _setting_value)

# getOrCreate() hands back the session configured above.
spark = _session_builder.getOrCreate()

In [6]:
from pyspark.sql import functions as F

In [3]:
# Read the curated final dataset (parquet) and preview its first five rows.
data = spark.read.format('parquet').load('../data/curated/final_dataset')
data.limit(5)

business_area_type,merchant_abn,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,consumer_scaled_spare_money,ap_percentage_by_gender,annual_turnover_percentage
Retail trade,11788487195,184,1087.1773317511631,Id Magna LLP,music shops - mus...,b,3.51,0.0802806368019285,0.0871576086956521,21.9
Retail trade,59962058395,377,306.7863223809038,Pellentesque Tinc...,florists supplies...,d,1.31,0.2537913711493971,0.0833965517241379,21.9
Retail trade,33765505000,29,829.8756270760261,Ut Nulla Associates,tent and awning s...,c,1.58,-0.03067390145974...,0.0833965517241379,21.9
Manufacturing,72167534195,778,93.60350729533076,Quisque Libero Co...,"opticians, optica...",b,4.11,0.1763360446096375,0.0858136246786632,17.0
Retail trade,87285933135,324,97.10476271896748,Proin Vel Associates,shoe shops,c,1.67,0.2756853056509352,0.0870061728395061,21.9


If we use the annual transaction count to predict revenue, run the following cells.

In [4]:
# Read the cleaned full dataset (parquet) that the transaction filter below works on.
clean_transaction = spark.read.format('parquet').load('../data/curated/clean_full_dataset')

In [7]:
# Keep only the rows whose order_datetime is strictly after 2021-08-27.
# Note: despite its name, this variable holds the filtered rows, not a count.
annual_transaction_count = clean_transaction.where(
    F.col("order_datetime") > F.lit("2021-08-27")
)

In [9]:
# Number of filtered rows per merchant_abn, exposed in a column named "count".
annual_merchant_transaction = (
    annual_transaction_count
    .groupBy("merchant_abn")
    .agg(F.count(F.lit(1)).alias("count"))
)

In [36]:
# Attach the per-merchant "count" column to `data`; the left join keeps every
# row of `data`, including merchants absent from annual_merchant_transaction.
full_data = data.join(annual_merchant_transaction, "merchant_abn", "left")

In [46]:
# Display the joined frame.
# NOTE(review): the saved output of this cell (In [46]) already contains
# expected_transaction and expected_revenue, which are only added by the cells
# further down -- the cell was executed out of order, so re-running top to
# bottom will show fewer columns here.
full_data

merchant_abn,business_area_type,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,consumer_scaled_spare_money,ap_percentage_by_gender,annual_turnover_percentage,count,expected_transaction,expected_revenue
48214071373,Retail trade,469,300.94198389965146,Ridiculus LLC,motor vehicle sup...,b,4.02,0.12917558022955608,0.0864754797441364,21.9,341,571,1076.34729396359
73256306726,Other services,4749,284.38398242669064,Id LLP,health and beauty...,b,4.81,0.03254363266004481,0.0850504316698252,2.4,3371,4862,10304.839590355645
83412691377,Other services,12748,34.92686555664642,Suspendisse Sagit...,"watch, clock, and...",c,2.94,-0.00802709826851...,0.0850262786319422,2.4,9103,13053,2077.0194547644774
34440496342,Manufacturing,192,89.5003676383182,Mauris Nulla Inte...,"opticians, optica...",c,2.85,0.2587477349990235,0.0897890625,17.0,135,224,91.01113384405302
38700038932,Retail trade,6383,1337.4804473531829,Etiam Bibendum In...,tent and awning s...,a,6.31,-0.06052124811389448,0.0852085226382578,21.9,4537,7780,101831.02658068771
73841664453,Retail trade,852,86.86264397291626,Lacinia At LLP,digital goods: bo...,a,5.55,0.13532990735039052,0.0846050469483568,21.9,591,1038,769.7011803877275
78916025936,Retail trade,64,319.68115127325086,Urna Nec Corporation,florists supplies...,e,0.37,0.21464664863316452,0.0864531249999999,21.9,48,78,13.555120176288384
60654402457,Retail trade,171,86.2293852509412,Lacus Quisque Imp...,digital goods: bo...,b,3.52,-4.008149999537232,0.0794356725146198,21.9,119,208,92.75798446706048
19839532017,Retail trade,674,157.0,Pellentesque Habi...,"cable, satellite,...",b,4.94,-0.8502169050815052,0.0852284866468842,21.9,484,821,980.876026
38986645707,Retail trade,38,943.1286654741962,Lectus Justo LLC,bicycle shops - s...,c,1.72,0.27008298844572065,0.0895131578947368,21.9,30,46,117.12149019324757


In [37]:
# expected_transaction = total_transactions_count * (1 + annual_turnover_percentage / 100),
# cast to an integer column.
full_data = full_data.withColumn(
    "expected_transaction",
    (
        F.col("total_transactions_count")
        * (1 + F.col("annual_turnover_percentage") / 100)
    ).cast("int"),
)

In [50]:
# expected_revenue is the sum of two value terms multiplied by take_rate / 100:
#   1. int(expected_transaction * ap_percentage_by_gender) * avg_total_value
#   2. expected_transaction * 0.07 * avg_total_value
# NOTE(review): the 0.07 factor is not explained anywhere in the visible cells -- confirm its meaning.
full_data = full_data.withColumn(
    "expected_revenue",
    (
        (F.col("expected_transaction") * F.col("ap_percentage_by_gender")).cast("int")
        * F.col("avg_total_value")
        + (F.col("expected_transaction") * 0.07 * F.col("avg_total_value"))
    )
    * F.col("take_rate")
    / 100,
)


In [51]:
# Narrow the frame to the merchant identifiers plus the expected_revenue score.
ranking = full_data.select(
    [
        "merchant_abn",
        "name",
        "business_area",
        "expected_revenue",
    ]
)

In [52]:
# Show merchants from highest to lowest expected_revenue.
ranking.sort(F.desc("expected_revenue"))

merchant_abn,name,business_area,expected_revenue
79827781481,Amet Risus Inc.,"furniture, home f...",108018.3380069306
32361057556,Orci In Consequat...,"gift, card, novel...",105036.50024917642
38700038932,Etiam Bibendum In...,tent and awning s...,101831.02658068771
48534649627,Dignissim Maecena...,"opticians, optica...",100859.6176281757
45629217853,Lacus Consulting,"gift, card, novel...",98872.32144781436
96680767841,Ornare Limited,motor vehicle sup...,97657.09041580516
21439773999,Mauris Non Institute,"cable, satellite,...",96867.2069639873
63123845164,Odio Phasellus In...,artist supply and...,94992.92399436876
64403598239,Lobortis Ultrices...,music shops - mus...,94273.21732778348
89726005175,Est Nunc Consulting,tent and awning s...,90160.8470240074


In [45]:
busi_area_type = []