In [1]:
# create modeling spark session
from pyspark.sql import SparkSession

# Session settings, applied to the builder in the order listed below.
_session_options = {
    # Render a DataFrame as a table when it is the last expression of a cell.
    'spark.sql.repl.eagerEval.enabled': True,
    'spark.sql.parquet.cacheMetadata': 'true',
    # Pin the session time zone so timestamp handling does not depend on the host.
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "8g",
    "spark.driver.memory": "8g",
}

_session_builder = SparkSession.builder.appName('Project 2')
for _option, _setting in _session_options.items():
    _session_builder = _session_builder.config(_option, _setting)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = _session_builder.getOrCreate()

In [2]:
from pyspark.sql import functions as F

In [3]:
# Load the curated merchant-level dataset (one row per merchant_abn in the
# preview below) and show the first five rows.
data = spark.read.format("parquet").load('../data/curated/final_dataset')
data.limit(5)

merchant_abn,business_area_type,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,consumer_scaled_spare_money,ap_percentage_by_gender,annual_turnover_percentage,fraud_rate
34440496342,Manufacturing,191,89.59955556402387,Mauris Nulla Inte...,"opticians, optica...",c,2.85,0.2566671150366628,0.089651832460733,17.0,0.015706806282722512
83412691377,Other services,12516,34.89369140888509,Suspendisse Sagit...,"watch, clock, and...",c,2.94,-0.0115045642816724,0.0850199344838606,2.4,0.002796420581655481
35344855546,Other services,1330,86.73320563981618,Quis Tristique Ac...,"watch, clock, and...",c,2.92,0.0765624529048179,0.0842616541353383,2.4,0.003759398496240...
59128133246,Manufacturing,5607,30.12883220396019,Ut Tincidunt Foun...,"opticians, optica...",a,6.3,-0.0881600573104575,0.0857556625646513,17.0,0.003923666845015...
86662713230,Other services,18947,52.36669172726792,Vestibulum Accums...,"watch, clock, and...",a,6.41,0.1057907662613883,0.0850380007389032,2.4,0.003219507045970338


In [62]:
# Spot-check the record of a single merchant, looked up by its ABN.
data.where(F.col("merchant_abn") == 89598973352)

merchant_abn,business_area_type,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,consumer_scaled_spare_money,ap_percentage_by_gender,annual_turnover_percentage,fraud_rate
89598973352,Retail trade,96,6260.185023263514,Ipsum LLC,antique shops - s...,b,4.79,-7.319626782183222,0.0851822916666666,21.9,0.3229166666666667


In [4]:
# Number of rows in the curated merchant dataset.
data.count()

3082

To use the annual transaction count to predict revenue, run the following cells.

In [12]:
# Transaction-level data used to derive a per-merchant transaction count.
clean_transaction = spark.read.format("parquet").load('../data/curated/clean_full_dataset')

# Despite its name this is a DataFrame of transactions, not a count: it keeps
# only the rows whose order_datetime is later than 2021-08-27.
annual_transaction_count = clean_transaction.where(
    F.col("order_datetime") > "2021-08-27"
)

# One row per merchant; the number of retained transactions lands in `count`.
annual_merchant_transaction = annual_transaction_count.groupBy("merchant_abn").count()

# A left join keeps every merchant from `data`; a merchant with no retained
# transactions gets a null `count`.
full_data = data.join(annual_merchant_transaction, "merchant_abn", "left")

In [63]:
# ap_rate = 1% of the consumer scaled spare money + ap_percentage_by_gender.
# The cast uses "double" rather than "float": a 32-bit float would truncate
# ap_percentage_by_gender to about 7 significant digits before it is added to
# a double, while the expected_revenue formula uses the same column at full
# precision.
# NOTE(review): ap_rate is negative whenever the spare-money term outweighs
# ap_percentage_by_gender; confirm whether that is intended.
full_data = full_data.withColumn(
    "ap_rate",
    F.col("consumer_scaled_spare_money") * 0.01
    + F.col("ap_percentage_by_gender").cast("double"),
)

In [64]:
# Calculate the expected transaction count: scale total_transactions_count by
# (1 + annual_turnover_percentage / 100) and truncate the result to an integer.
_turnover_multiplier = 1 + F.col("annual_turnover_percentage") / 100
full_data = full_data.withColumn(
    "expected_transaction",
    (F.col("total_transactions_count") * _turnover_multiplier).cast("int"),
)

In [65]:
# expected_revenue =
#   (int(expected_transaction * ap_percentage_by_gender) * avg_total_value
#    + expected_transaction * 0.077 * avg_total_value) * take_rate / 100
# NOTE(review): the 0.077 factor is a hard-coded constant whose origin is not
# documented in this notebook; confirm its meaning.
_ap_transaction_value = (
    (F.col("expected_transaction") * F.col("ap_percentage_by_gender")).cast("int")
    * F.col("avg_total_value")
)
_flat_share_value = (
    F.col("expected_transaction") * 0.077 * F.col("avg_total_value")
)
full_data = full_data.withColumn(
    "expected_revenue",
    (_ap_transaction_value + _flat_share_value) * F.col("take_rate") / 100,
)


In [66]:
# Alternative estimate based on ap_rate:
#   int(expected_transaction * 1.077 * ap_rate) * avg_total_value * take_rate / 100
# NOTE(review): ap_rate can be negative, which produces a negative
# expected_revenue1 (visible when sorting ascending); confirm that this is
# acceptable for the ranking. The 1.077 factor is a hard-coded constant.
_ap_rate_transactions = (
    F.col("expected_transaction") * 1.077 * F.col("ap_rate")
).cast("int")
full_data = full_data.withColumn(
    "expected_revenue1",
    _ap_rate_transactions * F.col("avg_total_value") * F.col("take_rate") / 100,
)



In [67]:
# Keep only the identifying columns and the metrics used to rank merchants.
_ranking_columns = [
    "merchant_abn",
    "name",
    "business_area",
    "expected_revenue",
    "expected_revenue1",
    "fraud_rate",
    "ap_rate",
]
ranking = full_data.select(*_ranking_columns)

In [68]:
# Highest expected_revenue first; ties are broken by the lower fraud_rate.
ranking.orderBy(["expected_revenue", "fraud_rate"], ascending=[False, True])

merchant_abn,name,business_area,expected_revenue,expected_revenue1,fraud_rate,ap_rate
79827781481,Amet Risus Inc.,"furniture, home f...",110799.73510359028,61388.51126937062,0.1638525564803805,0.0834472723212229
32361057556,Orci In Consequat...,"gift, card, novel...",107965.60938014746,61178.44798191932,0.003303451307394...,0.0853819109486937
38700038932,Etiam Bibendum In...,tent and awning s...,104239.30325807852,58515.97937124752,0.060731980182195944,0.084592413809592
48534649627,Dignissim Maecena...,"opticians, optica...",103566.79174104368,58139.044044915856,0.017240785394419565,0.0844865947678084
45629217853,Lacus Consulting,"gift, card, novel...",101633.9382354574,57703.08160806118,0.003114328917194...,0.0855289361703804
96680767841,Ornare Limited,motor vehicle sup...,100320.69815855534,56680.96952327896,0.009468237366508862,0.0851181392027642
21439773999,Mauris Non Institute,"cable, satellite,...",99595.57975972437,56291.96599762329,0.008380364380137116,0.0851807953535045
63123845164,Odio Phasellus In...,artist supply and...,97642.65953675505,54556.19035739352,0.015253386853988962,0.0842998733875946
64403598239,Lobortis Ultrices...,music shops - mus...,96883.0973593516,54760.20056395745,0.003824515142060...,0.0851725322707218
89726005175,Est Nunc Consulting,tent and awning s...,92604.81968481065,52603.74106295616,0.003419491099265815,0.0855931409043995


In [69]:
# Lowest expected_revenue1 first, to inspect the bottom of the ranking
# (this is where negative and zero estimates show up).
ranking.orderBy("expected_revenue1", ascending=True)

merchant_abn,name,business_area,expected_revenue,expected_revenue1,fraud_rate,ap_rate
97638827169,Rutrum PC,telecom,352.3021584361444,-481.8356395297165,0.1481481481481481,-0.1819623361805084
66236596240,Ut Ipsum Incorpor...,tent and awning s...,125.84516786506998,-175.19048890729002,0.16,-0.1967169088779563
47393015569,Nonummy Ultricies...,"opticians, optica...",47.962671419290245,-49.45625017456202,0.0625,-0.14141206673811293
58380745308,Auctor Quis Trist...,health and beauty...,145.1555648495501,-14.508302333788118,0.015625,-0.01876701197276...
24976666868,Duis Dignissim In...,"hobby, toy and ga...",1777.0422321906165,0.0,0.1645569620253164,-0.00192333702210...
91744809343,Egestas Associates,"furniture, home f...",124.7152713194384,0.0,0.021505376344086,0.006584408230726604
46429297007,Magnis Dis Institute,art dealers and g...,63.175015238863615,0.0,0.2,0.08102068672724613
35235320738,Gravida Sit Incor...,antique shops - s...,175.13366686111775,0.0,0.3333333333333333,0.05702085923047812
54277261175,In Magna PC,antique shops - s...,162.5602393437372,0.0,0.3333333333333333,0.07852949094216193
16377211887,Gravida Mauris PC,"jewelry, watch, c...",109.48315121814932,0.0,0.625,0.07827686055047406


In [45]:
busi_area_type = []