In [1]:
# Imports for the whole notebook, followed by the modeling Spark session.
import pathlib

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Session options, applied to the builder in the order listed here.
_session_options = {
    'spark.sql.repl.eagerEval.enabled': True,
    'spark.sql.parquet.cacheMetadata': 'true',
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "8g",
    "spark.driver.memory": "8g",
}

_session_builder = SparkSession.builder.appName('Project 2')
for _option_key, _option_value in _session_options.items():
    _session_builder = _session_builder.config(_option_key, _option_value)

spark = _session_builder.getOrCreate()

In [2]:
# Make sure the output folder for the rank tables exists (parents included);
# an already-existing folder is not an error.
rank_dir = pathlib.Path("../data/curated/rank")
rank_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the curated merchant-level dataset from parquet.
data = spark.read.format("parquet").load('../data/curated/final_dataset')

Use annual transaction count to predict future revenue

In [4]:
# Count each merchant's transactions dated after 2021-08-27 and attach the
# count to the merchant dataset. The left join keeps merchants that have no
# transaction in that window; their "count" is null.
clean_transaction = spark.read.parquet('../data/curated/clean_full_dataset')
annual_transaction_count = clean_transaction.where(
    F.col("order_datetime") > "2021-08-27"
)
annual_merchant_transaction = (
    annual_transaction_count
    .groupBy("merchant_abn")
    .count()
    .select("merchant_abn", "count")
)
full_data = data.join(annual_merchant_transaction, "merchant_abn", "left")


In [5]:
# Proportion of potential afterpay users:
#   ap_rate = 0.1 * consumer_scaled_spare_money + ap_percentage_by_gender (as float)
spare_money_term = F.col("consumer_scaled_spare_money") * 0.1
gender_term = F.col("ap_percentage_by_gender").cast("float")
full_data = full_data.withColumn("ap_rate", spare_money_term + gender_term)

In [6]:
# Segment label for each distinct business area, matched to `busi_area` below
# purely BY POSITION.
# NOTE(review): `busi_area` is collected from a groupby with no explicit
# ordering, and Spark does not guarantee the row order of such a result. If the
# order ever differs from the one this list was written against, areas will be
# labeled with the wrong segment without any error. Replacing this list with an
# explicit {business_area: segment} dict would remove the order dependence —
# TODO confirm the intended pairing and convert.
busi_area_type = [
    "fashion", "fashion", "electronics", "recreation",
    "recreation", "household", "recreation", "household",
    "recreation", "household", "electronics", "household",
    "recreation", "household", "household", "electronics",
    "fashion", "fashion", "recreation", "electronics",
    "recreation", "recreation", "household", "household", "household"]

# Distinct business areas present in the data, as a plain Python list.
busi_area = (
    full_data.groupby('business_area')
    .count()
    .select("business_area")
    .toPandas()['business_area']
    .to_list()
)

# zip() stops at the shorter input, and the inner join below drops every
# merchant whose business area received no label. Fail loudly instead of
# silently losing merchants (or labels) when the counts disagree.
if len(busi_area) != len(busi_area_type):
    raise ValueError(
        f"Found {len(busi_area)} distinct business areas but "
        f"{len(busi_area_type)} segment labels; the positional mapping is invalid."
    )

# Two-column lookup table: business_area -> business_segment.
allocation_sdf = spark.createDataFrame(
    pd.DataFrame(
        list(zip(busi_area, busi_area_type)),
        columns=['business_area', 'business_segment'],
    )
)

# Inner join: attaches the segment label to every merchant row.
full_data = full_data.join(allocation_sdf, on="business_area")

In [7]:
# Expected transaction count for next year: this year's count grown by
# annual_turnover_percentage (a percentage, hence the /100), truncated to int.
growth_factor = 1 + F.col("annual_turnover_percentage") / 100
full_data = full_data.withColumn(
    "expected_transaction",
    (F.col("count") * growth_factor).cast("int"),
)

In [8]:
# Some ap_rate values are below 0; replace those with 0.001 and keep the rest.
current_ap_rate = F.col("ap_rate")
full_data = full_data.withColumn(
    "ap_rate",
    F.when(current_ap_rate < 0, 0.001).otherwise(current_ap_rate),
)

In [9]:
# Expected revenue, variant 0.
# afterpay users = int(transactions * gender ap) + int(transactions * 0.077);
# each estimate is truncated to an integer separately before being summed.
# revenue = users * avg_total_value * take_rate / 100
expected_txn = F.col("expected_transaction")
avg_order_value = F.col("avg_total_value")

gender_based_users = (expected_txn * F.col("ap_percentage_by_gender")).cast("int")
baseline_users = (expected_txn * 0.077).cast("int")

gross_order_value = gender_based_users * avg_order_value + (baseline_users * avg_order_value)

full_data = full_data.withColumn(
    "expected_revenue",
    gross_order_value * F.col("take_rate") / 100,
)


In [10]:
# Expected revenue, variant 1.
# afterpay users = int(transactions * 1.077 * ap_rate)
# revenue = users * avg_total_value * take_rate / 100
scaled_users = (F.col("expected_transaction") * 1.077 * F.col("ap_rate")).cast("int")
full_data = full_data.withColumn(
    "expected_revenue1",
    (scaled_users * F.col("avg_total_value")) * F.col("take_rate") / 100,
)

In [11]:
# Expected revenue, variant 2.
# afterpay users = int(transactions * (0.077 + ap_rate))
# revenue = users * avg_total_value * take_rate / 100
combined_rate = 0.077 + F.col("ap_rate")
additive_users = (F.col("expected_transaction") * combined_rate).cast("int")
full_data = full_data.withColumn(
    "expected_revenue2",
    additive_users * F.col("avg_total_value") * F.col("take_rate") / 100,
)

In [12]:
# Preview the first five rows to compare the three expected-revenue variants
# (expected_revenue, expected_revenue1, expected_revenue2) side by side.
full_data.limit(5)

business_area,merchant_abn,business_area_type,total_transactions_count,avg_total_value,name,revenue_level,take_rate,consumer_scaled_spare_money,ap_percentage_by_gender,annual_turnover_percentage,fraud_rate,count,ap_rate,business_segment,expected_transaction,expected_revenue,expected_revenue1,expected_revenue2
antique shops - s...,37935728745,Retail trade,9,11317.019602391396,Laoreet Lectus Qu...,b,4.12,0.0966893187534178,0.0922777777777777,21.9,0.4444444444444444,6,0.1019467120505508,household,7,0.0,0.0,466.26120761852553
antique shops - s...,46916077029,Retail trade,71,13522.620077171276,Mauris Aliquam LLC,a,6.8,0.2560244766281576,0.0872183098591549,21.9,0.4788732394366197,53,0.1128207546214911,household,64,8275.843487228822,6436.767156733527,11034.457982971762
antique shops - s...,21319642626,Retail trade,8,22661.382077424903,At Risus Associates,b,3.29,0.4466573950774365,0.0816875,21.9,0.5,6,0.1263532420707434,household,7,0.0,0.0,745.5594703472793
antique shops - s...,11024352823,Retail trade,379,214.29456949819507,Aliquet Metus Urn...,c,2.62,0.2651272370833886,0.0859828496042216,21.9,0.0026385224274406,275,0.1124955753924323,household,335,297.56943920519376,224.5807088341084,353.7146164137208
antique shops - s...,83173175334,Retail trade,16,13574.87446695437,Enim Gravida Inc.,a,5.81,0.5519064947172286,0.07596875,21.9,0.5625,11,0.1311593992929089,household,13,788.700206530049,788.700206530049,1577.400413060098


In [13]:
# Keep only the columns needed to build the merchant ranking.
ranking_columns = [
    "merchant_abn",
    "name",
    "business_area",
    "expected_revenue",
    "expected_revenue1",
    "expected_revenue2",
    "fraud_rate",
    "ap_rate",
    "revenue_level",
    "business_segment",
]
ranking = full_data.select(*ranking_columns)

In [14]:
# true_revenue = expected_revenue * (1 - fraud_rate); rows are then sorted by
# true_revenue in ascending order (the orderBy default).
fraud_adjusted_revenue = F.col("expected_revenue") * (1 - F.col("fraud_rate"))
ranking = (
    ranking
    .withColumn("true_revenue", fraud_adjusted_revenue)
    .orderBy("true_revenue")
)

In [15]:
# Remove the intermediate columns now that true_revenue has been derived.
intermediate_columns = [
    "expected_revenue",
    "expected_revenue1",
    "expected_revenue2",
    "fraud_rate",
    "ap_rate",
]
ranking = ranking.drop(*intermediate_columns)

In [16]:
# Household segment: merchants sorted by true_revenue, highest first,
# written to parquet (replacing any previous output) and then displayed.
household_rank = (
    ranking
    .where(F.col("business_segment") == "household")
    .sort(F.desc("true_revenue"))
)
household_rank.write.parquet("../data/curated/rank/household_rank/", mode="overwrite")
household_rank

merchant_abn,name,business_area,revenue_level,business_segment,true_revenue
96680767841,Ornare Limited,motor vehicle sup...,a,household,71084.1441208355
38700038932,Etiam Bibendum In...,tent and awning s...,a,household,69455.09933560257
89726005175,Est Nunc Consulting,tent and awning s...,a,household,65618.73303178482
79827781481,Amet Risus Inc.,"furniture, home f...",a,household,64801.02826181595
49891706470,Non Vestibulum In...,tent and awning s...,a,household,50899.94176843741
76767266140,Phasellus At Limited,"furniture, home f...",b,household,50651.47914750627
43186523025,Lorem Ipsum Sodal...,florists supplies...,b,household,49421.94626067703
80551528183,Ac Ipsum LLC,tent and awning s...,b,household,49380.09663102982
90568944804,Diam Eu Dolor LLC,tent and awning s...,b,household,46965.81303945826
49549583265,Luctus Et Incorpo...,tent and awning s...,a,household,46937.21487505494


In [17]:
# Recreation segment: merchants sorted by true_revenue, highest first,
# written to parquet (replacing any previous output) and then displayed.
recreation_rank = (
    ranking
    .where(F.col("business_segment") == "recreation")
    .sort(F.desc("true_revenue"))
)
recreation_rank.write.parquet("../data/curated/rank/recreation_rank/", mode="overwrite")
recreation_rank

merchant_abn,name,business_area,revenue_level,business_segment,true_revenue
32361057556,Orci In Consequat...,"gift, card, novel...",a,recreation,76446.87711899739
45629217853,Lacus Consulting,"gift, card, novel...",a,recreation,72174.829764078
64403598239,Lobortis Ultrices...,music shops - mus...,a,recreation,68500.2505103527
63123845164,Odio Phasellus In...,artist supply and...,a,recreation,68349.43847567191
94493496784,Dictum Phasellus ...,"gift, card, novel...",a,recreation,62849.26031255052
40515428545,Elit Sed Consequa...,artist supply and...,a,recreation,55759.23542901116
72472909171,Nullam Consulting,digital goods: bo...,a,recreation,55577.03064490392
79417999332,Phasellus At Company,"gift, card, novel...",b,recreation,55497.97854727429
67978471888,Magna Malesuada C...,artist supply and...,a,recreation,55079.92450594881
98973094975,Ornare Fusce Inc.,"hobby, toy and ga...",a,recreation,50031.258585266165


In [18]:
# Electronics segment: merchants sorted by true_revenue, highest first,
# written to parquet (replacing any previous output) and then displayed.
electronic_rank = (
    ranking
    .where(F.col("business_segment") == "electronics")
    .sort(F.desc("true_revenue"))
)
electronic_rank.write.parquet("../data/curated/rank/electronic_rank/", mode="overwrite")
electronic_rank

merchant_abn,name,business_area,revenue_level,business_segment,true_revenue
21439773999,Mauris Non Institute,"cable, satellite,...",a,electronics,70123.88399183142
82368304209,Nec Incorporated,telecom,a,electronics,53660.30852958023
45433476494,Adipiscing Elit F...,"computers, comput...",a,electronics,51362.41505377121
35909341340,Arcu Sed Eu Incor...,computer programm...,b,electronics,51043.88829494652
94690988633,Eu Placerat LLC,"computers, comput...",a,electronics,47421.26098812279
58454491168,Diam At Foundation,computer programm...,a,electronics,45131.137903996176
80518954462,Neque Sed Dictum ...,"computers, comput...",b,electronics,38167.61308304784
67400260923,Eleifend PC,computer programm...,a,electronics,37435.247521033656
34096466752,Nullam Enim Ltd,"computers, comput...",b,electronics,36413.45306685639
77590625261,Sed Diam Foundation,computer programm...,b,electronics,27499.78277305351


In [19]:
# Fashion segment: merchants sorted by true_revenue, highest first,
# written to parquet (replacing any previous output) and then displayed.
fashion_rank = (
    ranking
    .where(F.col("business_segment") == "fashion")
    .sort(F.desc("true_revenue"))
)
fashion_rank.write.parquet("../data/curated/rank/fashion_rank/", mode="overwrite")
fashion_rank

merchant_abn,name,business_area,revenue_level,business_segment,true_revenue
48534649627,Dignissim Maecena...,"opticians, optica...",a,fashion,72370.66455459432
86578477987,Leo In Consulting,"watch, clock, and...",a,fashion,63172.729776809945
49322182190,Gravida Mauris In...,"watch, clock, and...",a,fashion,51242.15280876644
93558142492,Dolor Quisque Inc.,shoe shops,b,fashion,36666.83519879501
11439466003,Blandit At LLC,shoe shops,a,fashion,34025.18078647288
95574756848,At Pede Inc.,"opticians, optica...",a,fashion,29252.206391983826
99976658299,Sociosqu Corp.,shoe shops,a,fashion,27709.426412713656
19492220327,Commodo Ipsum Ind...,"jewelry, watch, c...",b,fashion,26997.774164823237
46804135891,Suspendisse Dui C...,"opticians, optica...",c,fashion,24242.32782135498
62224020443,Hendrerit A Corpo...,"watch, clock, and...",a,fashion,23303.30471029042
