In [243]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import *
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StandardScaler
from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank
from pyspark.ml.functions import vector_to_array
from pyspark.ml.feature import VectorAssembler
import numpy as np


spark = (
    SparkSession.builder.appName("ADS project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [244]:
df_old = spark.read.parquet('../data/curated/merchant_consumer_abs')

                                                                                

In [245]:
df_old

                                                                                

postcode,total_earners,median_age,income_sum,2021_population,km2,income_mean,persons/km2,merchant_name,revenue_level,user_id,order_datetime,products,take_rate,tag,category,dollar_value,order_year,order_month,order_day,consumer,consumer_address,consumer_state,consumer_postcode,consumer_gender,fraud_group,__index_level_0__
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Elit Sed Consequa...,a,10413,2022-04-27,artist supply and...,5.89,artist supply craft,art and gifts,375.16773164703153,2022,4,27,Kelly Clayton,4211 Rodney Tunne...,NT,800,Female,0,0
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Elit Sed Consequa...,a,22246,2022-10-07,artist supply and...,5.89,artist supply craft,art and gifts,617.8791313462011,2022,10,7,Corey Estrada,1703 Boyd Shore,NT,800,Male,0,1
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Elit Sed Consequa...,a,2681,2022-02-28,artist supply and...,5.89,artist supply craft,art and gifts,766.5077067424303,2022,2,28,Nicole Bishop,467 Robert Island...,NT,800,Female,0,2
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Malesuada Vel Ltd,b,13454,2021-07-24,books periodicals...,3.56,books periodicals...,books and music,258.8659589921876,2021,7,24,Natalie Herrera,88798 Saunders Hi...,NT,800,Female,0,3
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Varius Orci Insti...,a,10146,2021-08-08,tent and awning s...,6.3,tent awning,outdoors,4.159037931172075,2021,8,8,Shannon Mann,00817 Owens Circle,NT,800,Female,0,4
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Volutpat Nulla In...,a,23093,2022-10-11,furniture home fu...,6.59,furniture home fu...,home and technology,8.26105405068297,2022,10,11,Robert Walker,646 Allison Ranch...,NT,800,Male,0,5
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Mollis Integer Co...,b,13454,2022-05-02,digital goods boo...,3.96,digital goods boo...,books and music,83.63154755239155,2022,5,2,Natalie Herrera,88798 Saunders Hi...,NT,800,Female,0,6
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Elit Sed Consequa...,a,13454,2021-04-08,artist supply and...,5.89,artist supply craft,art and gifts,87.00495112586256,2021,4,8,Natalie Herrera,88798 Saunders Hi...,NT,800,Female,0,7
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Eros Limited,c,8092,2021-12-18,digital goods boo...,2.52,digital goods boo...,books and music,8.26187754879106,2021,12,18,Rebecca Owens,92579 Keith Neck ...,NT,800,Female,0,8
800,5631.999973601934,36.75,420609029.345557,7678.999968193022,3.199999994700948,74682.00129918633,2399.687494034091,Hendrerit A Corpo...,a,11203,2022-07-14,watch clock and j...,6.64,watch clock jewel...,fashion and acces...,100.80643267043833,2022,7,14,John Rodriguez,563 Wallace Rue,NT,800,Male,0,9


In [246]:
df_old.select(F.col("fraud_group") == 2).count()

11920453

## 1. Calculate the fraud score for each merchants

In [247]:
# The highest possible mean value of fraud is 2
# Therefore we divide the mean value by two to keep it within [-1, 1]
df_fraud = df_old.groupBy(["merchant_name"]).agg((1-(mean("fraud_group")/2)).alias("fraud_score"))

In [248]:
df_fraud.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- fraud_score: double (nullable = true)



In [249]:
df_fraud.orderBy(F.col("fraud_score"))

                                                                                

merchant_name,fraud_score
Nulla Ltd,0.5
Mauris Ipsum Port...,0.5
At Lacus LLP,0.75
Eu Accumsan Sed I...,0.875
Augue Limited,0.875
Sem Magna Nec Ind...,0.9
Velit Cras Lorem ...,0.9117647058823528
Ut Ipsum Incorpor...,0.9210526315789472
Faucibus Leo Corp.,0.9444444444444444
In Mi PC,0.9444444444444444


## 2. Calculate the revenue score


In [250]:
df_predict = spark.read.parquet("../data/meta/predictions.parquet")
df_info = spark.read.parquet('../data/meta/merchant_metadata.parquet')

AnalysisException: Path does not exist: file:/Users/dongmenghan/Desktop/2022 yr3 s2/ads/project 2/generic-buy-now-pay-later-project-group-27/data/meta/predictions.parquet

In [None]:
df_category = (df_old.groupBy("merchant_name", "category").agg(max("take_rate").alias("take_rate"))).withColumnRenamed("merchant_name", "key")
df_revenue  = df_predict.groupBy(["merchant_name"]).agg(sum("prediction").alias("total_revenue"))
df_revenue = df_revenue.join(df_category, df_category.key ==  df_revenue.merchant_name,"inner").drop("key")

In [None]:
df_revenue

                                                                                

merchant_name,total_revenue,category,take_rate
Aliquam Auctor Ve...,187581.1319837608,outdoors,3.09
Accumsan Convalli...,1821808.2016016329,books and music,6.09
Lobortis Risus LLP,198914.5436484481,outdoors,4.53
Risus Odio Auctor...,614975.8978014218,fashion and acces...,5.99
Pellentesque Indu...,91228.16134131032,outdoors,2.66
Sodales Mauris Co...,190058.95977413544,fashion and acces...,1.5
Maecenas Iaculis ...,102660.30914673852,home and technology,1.5
Sit LLP,61322.97518850584,art and gifts,5.65
Curabitur Massa LLC,28413.476302695544,outdoors,3.61
Congue A Aliquet LLC,48167.49225988937,books and music,4.07


In [None]:
df_revenue = df_revenue.withColumn("revenue_taken", F.col("total_revenue") * (F.col("take_rate") / 100)).drop("total_revenue", "take_rate")
df_revenue

                                                                                

merchant_name,category,revenue_taken
Aliquam Auctor Ve...,outdoors,5796.256978298208
Accumsan Convalli...,books and music,110948.11947753944
Lobortis Risus LLP,outdoors,9010.8288272747
Risus Odio Auctor...,fashion and acces...,36837.056278305165
Pellentesque Indu...,outdoors,2426.6690916788552
Sodales Mauris Co...,fashion and acces...,2850.884396612032
Maecenas Iaculis ...,home and technology,1539.9046372010775
Sit LLP,art and gifts,3464.74809815058
Curabitur Massa LLC,outdoors,1025.7264945273091
Congue A Aliquet LLC,books and music,1960.4169349774968


In [None]:
revenue_score = df_revenue.withColumn("a", lit("a"))

# Use the rank as the score
Window_merchant = Window.partitionBy("a").orderBy("revenue_taken")
Window_category = Window.partitionBy("category").orderBy("revenue_taken")
revenue_score = revenue_score.withColumn("merchant_rev_score", percent_rank().over(Window_merchant)) 
revenue_score = revenue_score.withColumn("category_rev_score", percent_rank().over(Window_category)) 
revenue_score = revenue_score.drop("a", "revenue_taken")

In [None]:
revenue_score

                                                                                

merchant_name,category,merchant_rev_score,category_rev_score
Amet Lorem Consul...,art and gifts,0.0,0.0
Tincidunt Aliquam...,art and gifts,2.568053415511042...,0.001821493624772...
Amet Ultricies Se...,art and gifts,2.568053415511042...,0.001821493624772...
Vulputate Company,art and gifts,7.704160246533128E-4,0.00546448087431694
Eleifend Nunc LLP,art and gifts,0.001027221366204417,0.007285974499089253
Vehicula Pellente...,art and gifts,0.001284026707755...,0.009107468123861567
Interdum Curabitu...,art and gifts,0.001540832049306...,0.01092896174863388
Libero Integer PC,art and gifts,0.001797637390857...,0.012750455373406194
Nisl Arcu Iaculis...,art and gifts,0.002054442732408834,0.014571948998178506
Penatibus Et Magn...,art and gifts,0.002311248073959...,0.01639344262295082


Union the the fraud and revenue

In [None]:
df_fraud = df_fraud.withColumnRenamed("merchant_name", "key")
final_score = revenue_score.join(df_fraud, df_fraud.key ==  revenue_score.merchant_name,"inner").drop("key")
final_score

                                                                                

merchant_name,category,merchant_rev_score,category_rev_score,fraud_score
Amet Lorem Consul...,art and gifts,0.0,0.0,1.0
Tincidunt Aliquam...,art and gifts,2.568053415511042...,0.001821493624772...,1.0
Amet Ultricies Se...,art and gifts,2.568053415511042...,0.001821493624772...,0.9966666666666668
Vulputate Company,art and gifts,7.704160246533128E-4,0.00546448087431694,1.0
Eleifend Nunc LLP,art and gifts,0.001027221366204417,0.007285974499089253,1.0
Vehicula Pellente...,art and gifts,0.001284026707755...,0.009107468123861567,0.9923076923076924
Interdum Curabitu...,art and gifts,0.001540832049306...,0.01092896174863388,1.0
Libero Integer PC,art and gifts,0.001797637390857...,0.012750455373406194,0.9978540772532188
Nisl Arcu Iaculis...,art and gifts,0.002054442732408834,0.014571948998178506,0.9974226804123713
Penatibus Et Magn...,art and gifts,0.002311248073959...,0.01639344262295082,1.0


### 3. Potential ability

In [None]:
trend = spark.read.option("header", "true").csv("../data/curated/increasing.csv")
trend = trend.withColumn("monthly_increase_rate", F.col("monthly_increase_rate").cast("double"))

In [None]:
trend_positive = trend.filter(F.col("monthly_increase_rate") >= 0)
trend_negative = trend.filter(F.col("monthly_increase_rate") < 0)

In [None]:

# VectorAssembler Transformation - Converting column to vector type
assembler_trend = VectorAssembler(inputCols=["monthly_increase_rate"],outputCol="increasing_Vect")
df_trend = assembler_trend.transform(trend)
scaler = MinMaxScaler(inputCol="increasing_Vect", outputCol="trend_score")

# Standardise the trend score (since it contain negative value) as score
df_trend = scaler.fit(df_trend).transform(df_trend).drop("monthly_increase_rate","increasing_Vect").withColumn("trend_score", (2*(vector_to_array("trend_score")[0])-1))




In [None]:
df_trend = df_trend.withColumnRenamed("merchant_name","key")
final_score = final_score.join(df_trend, df_trend.key ==  revenue_score.merchant_name,"inner").drop("key")
final_score

                                                                                

merchant_name,category,merchant_rev_score,category_rev_score,fraud_score,trend_score
Amet Lorem Consul...,art and gifts,0.0,0.0,1.0,-0.2851362918686249
Tincidunt Aliquam...,art and gifts,2.568053415511042...,0.001821493624772...,1.0,-0.2919687621827521
Amet Ultricies Se...,art and gifts,2.568053415511042...,0.001821493624772...,0.9966666666666668,-0.2870785906746487
Vulputate Company,art and gifts,7.704160246533128E-4,0.00546448087431694,1.0,-0.2746884966991739
Eleifend Nunc LLP,art and gifts,0.001027221366204417,0.007285974499089253,1.0,-0.2850052340899205
Vehicula Pellente...,art and gifts,0.001284026707755...,0.009107468123861567,0.9923076923076924,-0.2787480220618894
Interdum Curabitu...,art and gifts,0.001540832049306...,0.01092896174863388,1.0,-0.2854461689839598
Libero Integer PC,art and gifts,0.001797637390857...,0.012750455373406194,0.9978540772532188,-0.2746415327823409
Nisl Arcu Iaculis...,art and gifts,0.002054442732408834,0.014571948998178506,0.9974226804123713,-0.2813403025210856
Penatibus Et Magn...,art and gifts,0.002311248073959...,0.01639344262295082,1.0,-0.2740012225049318


In [None]:
df_trend.sort(F.col("trend_score"))

key,trend_score
Rutrum Magna Cras...,-1.0
Sed Nec Inc.,-0.91774036166771
Pharetra Corp.,-0.8334200386655441
Fames Ac Turpis LLC,-0.6752184490530835
Amet Risus Inc.,-0.6596795683980416
Dignissim Lacus PC,-0.6399996588754957
In Lobortis Tellu...,-0.6257353623479553
Id Enim Inc.,-0.625085997402387
Ut Ipsum LLC,-0.6037409750782243
At Pede Inc.,-0.5833355926027


### 4. Stability

In [None]:
df_variance = df_old.groupBy(["merchant_name", "order_year", "order_month"]).agg(
    sum("dollar_value").alias("dollar_value")
    )

In [None]:
fill = {'2021': range(3,13),
        '2022': range(1,11)}
merchs = df_variance.select("merchant_name").distinct()
cols = ["order_year", "order_month", "dollar_value"]
vals = [(int(year), month, 0) for year in fill for month in fill[year]]
months = spark.createDataFrame(vals, cols)
months = merchs.join(months).select(df_variance.columns)
variance_score = df_variance.union(months)

In [None]:
variance_score

                                                                                

merchant_name,order_year,order_month,dollar_value
Lacinia At Institute,2022,8,11473.893084260102
Phasellus At Limited,2021,6,365871.4302617312
Eget Laoreet Posu...,2022,8,179542.76522072856
Eu Tempor Ltd,2021,4,5001.601112401771
Tempor LLC,2021,12,10105.609794197258
Semper Tellus PC,2022,6,36826.62851667091
Ac Institute,2021,6,34412.903467209915
Mattis Velit Just...,2021,3,98927.17618829764
Lacinia Mattis LLC,2022,5,55866.78480219141
Aliquam Gravida M...,2021,12,50557.11317859411


In [None]:
variance_score = variance_score.groupBy(["merchant_name", "order_year", "order_month"]).agg(
    sum("dollar_value").alias("dollar_value")
    )

In [None]:
variance_score = variance_score.groupBy(["merchant_name"]).agg(
    variance("dollar_value").alias("variance")
    )

In [None]:
variance_score

                                                                                

merchant_name,variance
Malesuada Vel Ltd,4456926.807088791
Mollis Integer Co...,45820094.08526634
Volutpat Nulla In...,196244569.09507588
Donec Luctus Indu...,332362806.4942223
Vulputate Velit E...,725151.9694819615
Erat Semper Ltd,123018000.75294748
Hendrerit Consect...,17210253.143555377
Vel Turpis Company,4287985.304659572
Curabitur Vel LLC,333323.39801631885
Malesuada PC,1973086.758359104


In [None]:
variance_score = variance_score.fillna(value = 0)

In [None]:
# VectorAssembler Transformation - Converting column to vector type
assembler_vec = VectorAssembler(inputCols=["variance"],outputCol="variance_Vect")
variance_score = assembler_vec.transform(variance_score)

scaler = MinMaxScaler(inputCol="variance_Vect", outputCol="variance_score")


# Standardise the trend score (since it contain negative value) as score
from pyspark.ml.functions import vector_to_array
variance_score = scaler.fit(variance_score).transform(variance_score).drop("variance","variance_Vect").withColumn("variance_score", vector_to_array("variance_score")[0])


                                                                                

In [None]:
variance_score = variance_score.withColumnRenamed("merchant_name","key")
final_score = final_score.join(variance_score, variance_score.key ==  revenue_score.merchant_name,"inner").drop("key")
final_score

                                                                                

merchant_name,category,merchant_rev_score,category_rev_score,fraud_score,trend_score,variance_score
Malesuada Vel Ltd,books and music,0.6782229070364664,0.6928657799274486,0.9968982630272952,-0.2659155054587914,3.298045669878045E-4
Mollis Integer Co...,books and music,0.886748844375963,0.8887545344619106,0.9988385598141696,-0.2263466938273277,0.003390957761643...
Volutpat Nulla In...,home and technology,0.9119157678479712,0.9227600411946448,0.9984037034080931,-0.248452214618109,0.014523382876334779
Donec Luctus Indu...,fashion and acces...,0.6599897277863379,0.5835654596100278,0.9975812953507122,-0.2821378659840347,0.02459704997399887
Vulputate Velit E...,fashion and acces...,0.2894196199280945,0.2005571030640668,1.0,-0.2862421418778321,5.362807428715451...
Erat Semper Ltd,outdoors,0.8877760657421674,0.8921212121212121,0.9973759541984732,-0.2705380856818037,0.009104123221804636
Hendrerit Consect...,books and music,0.8002054442732409,0.8234582829504232,0.9981889105600446,-0.3230723102625034,0.001273636682038701
Vel Turpis Company,home and technology,0.4740626605033384,0.518022657054583,0.9953416149068324,-0.3488128370442415,3.173017570266328...
Curabitur Vel LLC,books and music,0.2570621468926554,0.2164449818621523,1.0,-0.2796712302846442,2.463011880582002E-5
Malesuada PC,outdoors,0.5192604006163328,0.5212121212121212,0.997093023255814,-0.3102605606400732,1.459836605990655...


In [None]:
final_score.count()

                                                                                

3895

In [None]:
final_score.write.parquet("../data/curated/score_table")

                                                                                