In [31]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import *
import os
import csv

# Session-level Spark settings, applied in the order listed below.
_spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,   # render DataFrames when a cell ends with one
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

# Start the session for this notebook, or reuse one that is already running.
_builder = SparkSession.builder.appName("ADS project 2")
for _option, _setting in _spark_options.items():
    _builder = _builder.config(_option, _setting)
spark = _builder.getOrCreate()

In [32]:

# Read the score table (Parquet) from the curated data directory.
score = spark.read.format("parquet").load("../data/curated/score_table")

                                                                                

In [33]:
# Print the column names and types of the loaded score table as a sanity check.
score.printSchema()

root
 |-- merchant_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- merchant_rev_score: double (nullable = true)
 |-- category_rev_score: double (nullable = true)
 |-- fraud_score: double (nullable = true)
 |-- trend_score: double (nullable = true)
 |-- variance_score: double (nullable = true)



In [34]:
# Weights of the three terms in the final ranking score.
# NOTE(review): these sum to 1.2 rather than 1.0 — confirm this is intended.
REVENUE_FRAUD_WEIGHT = 0.7
VARIANCE_WEIGHT = 0.3
TREND_WEIGHT = 0.2


def _ranking_score(revenue_column):
    """Build the weighted ranking-score expression for one revenue column.

    The score is ``revenue * fraud_score * 0.7 + variance_score * 0.3
    + trend_score * 0.2``, where ``revenue`` is the column named by
    ``revenue_column``.

    Args:
        revenue_column: Name of the revenue score column to use
            ("merchant_rev_score" or "category_rev_score").

    Returns:
        A Spark Column expression computing the weighted score.
    """
    return (
        F.col(revenue_column) * F.col("fraud_score") * REVENUE_FRAUD_WEIGHT
        + F.col("variance_score") * VARIANCE_WEIGHT
        + F.col("trend_score") * TREND_WEIGHT
    )


# Both scores share one formula and differ only in which revenue score they use.
rank = (
    score
    .withColumn("score_merchant", _ranking_score("merchant_rev_score"))
    .withColumn("score_category", _ranking_score("category_rev_score"))
)

In [35]:
# Keep the identifying columns plus the two final scores, highest merchant score first.
_ranked_columns = ["merchant_name", "category", "score_merchant", "score_category"]
rank = rank.select(*_ranked_columns).orderBy(F.desc("score_merchant"))

In [36]:
# Persist the ranking as CSV. The default save mode raises an error when the
# target path already exists, which breaks "Restart & Run All" after the first
# run, so replace any previous result instead.
# NOTE(review): the files are written without a header row; consider
# header=True if downstream readers expect column names.
rank.write.mode("overwrite").csv("../data/result")

                                                                                

### Top 100 merchants

In [37]:
# Show the 100 merchants with the highest merchant-level score.
rank.orderBy(F.col("score_merchant").desc()).limit(100)

merchant_name,category,score_merchant,score_category
Diam Eu Dolor LLC,outdoors,1.0875684407373454,1.0913805052755747
Ornare Limited,outdoors,1.016399241920415,1.0171169529219697
Arcu Morbi Institute,art and gifts,1.0163948048627651,1.014985265625849
Iaculis Aliquet D...,outdoors,0.9628041727888778,0.961828672652549
Leo In Consulting,fashion and acces...,0.9457595545679616,0.9454281204859144
Nec Tellus Ltd,fashion and acces...,0.936717578076056,0.9350759692370748
Dictum Phasellus ...,art and gifts,0.934123188980306,0.9311681803167092
Suspendisse Non L...,outdoors,0.9336441678833632,0.9319652989642738
Arcu Sed Eu Incor...,home and technology,0.9335345770035204,0.9342284828626074
Auctor Company,outdoors,0.9231643504938236,0.9235160497536962


### Top 10 for each category

In [38]:
# Collect the distinct category labels present in the ranking into a Python list.
_category_rows = rank.select("category").distinct().collect()
cat = [row["category"] for row in _category_rows]
cat

['outdoors',
 'home and technology',
 'fashion and accessories',
 'books and music',
 'art and gifts']

In [39]:
# Top 10 merchants in the "outdoors" category, ranked by category-level score.
outdoors = rank.where(F.col("category") == "outdoors")
outdoors.orderBy(F.desc("score_category")).limit(10)

merchant_name,category,score_merchant,score_category
Diam Eu Dolor LLC,outdoors,1.0875684407373454,1.0913805052755747
Ornare Limited,outdoors,1.016399241920415,1.0171169529219697
Iaculis Aliquet D...,outdoors,0.9628041727888778,0.961828672652549
Suspendisse Non L...,outdoors,0.9336441678833632,0.9319652989642738
Auctor Company,outdoors,0.9231643504938236,0.9235160497536962
Est Nunc Consulting,outdoors,0.9045235274496832,0.9030518223078444
Lorem Ipsum Sodal...,outdoors,0.8996382947192149,0.899322413663958
Erat Vitae LLP,outdoors,0.8703719643611981,0.871793390217532
Pede Nonummy Corp.,outdoors,0.8295962503714107,0.832475246670172
Non Vestibulum In...,outdoors,0.821570078941968,0.8196101014206156


In [40]:
# Top 10 merchants in the "home and technology" category, ranked by
# category-level score. The limit was missing here, unlike every other
# category cell in this "Top 10 for each category" section.
home = rank.filter(F.col("category") == "home and technology")
home.sort(F.desc("score_category")).limit(10)

merchant_name,category,score_merchant,score_category
Arcu Sed Eu Incor...,home and technology,0.9335345770035204,0.9342284828626074
Mauris Non Institute,home and technology,0.9193203420897996,0.9211131949105829
Interdum Feugiat ...,home and technology,0.9012295050358198,0.9038300670007298
Phasellus At Limited,home and technology,0.8919255048817771,0.894580167829437
Phasellus Dapibus...,home and technology,0.8713316650265482,0.8733886773540367
Neque Sed Dictum ...,home and technology,0.8706495142199309,0.8713413675185132
Adipiscing Elit F...,home and technology,0.8031407336004195,0.8039689270529469
Nullam Enim Ltd,home and technology,0.7902110677760092,0.7977387263053399
Placerat Eget Ven...,home and technology,0.7898679636772492,0.7912741729397526
Sed Diam Foundation,home and technology,0.7879870345075162,0.7890357179880334


In [41]:
# Top 10 merchants in the "fashion and accessories" category, ranked by category-level score.
fashion = rank.where(F.col("category") == "fashion and accessories")
fashion.orderBy(F.desc("score_category")).limit(10)

merchant_name,category,score_merchant,score_category
Leo In Consulting,fashion and acces...,0.9457595545679616,0.9454281204859144
Nec Tellus Ltd,fashion and acces...,0.936717578076056,0.9350759692370748
Dignissim Maecena...,fashion and acces...,0.9107214920812337,0.9099276024196704
Gravida Mauris In...,fashion and acces...,0.8519567727687615,0.8497613800681746
Dolor Quisque Inc.,fashion and acces...,0.8223619123004936,0.8192019071009945
Suspendisse Dui C...,fashion and acces...,0.8024872501699538,0.8007814231834953
Iaculis LLC,fashion and acces...,0.7471513190188319,0.7436027236684143
Quam Elementum Corp.,fashion and acces...,0.7033251099161467,0.6994806840504677
Blandit At LLC,fashion and acces...,0.6948290629667331,0.6954224158001463
Magna Institute,fashion and acces...,0.6816326835341475,0.6712394009639159


In [42]:
# Top 10 merchants in the "books and music" category, ranked by category-level score.
books = rank.where(F.col("category") == "books and music")
books.orderBy(F.desc("score_category")).limit(10)

merchant_name,category,score_merchant,score_category
Lobortis Ultrices...,books and music,0.8995566368080294,0.900969572524783
Placerat Orci Ins...,books and music,0.8564326670015582,0.8627186372207024
Tempor Est Founda...,books and music,0.8312832957485824,0.8375081713853588
Euismod In Corp.,books and music,0.7996355880834806,0.803904172722113
Pretium Et LLC,books and music,0.7947301515240902,0.7965223960027754
Ornare Fusce Inc.,books and music,0.7889271607616435,0.7934957984249835
Dignissim Lacus PC,books and music,0.7945411447763896,0.7907881676108762
Nullam Consulting,books and music,0.7810705085923072,0.7859535595680842
Suspendisse Ac As...,books and music,0.7757301930334065,0.7765444302148056
Sodales At LLC,books and music,0.745365414586944,0.7464940372308507


In [44]:
# Top 10 merchants in the "art and gifts" category, ranked by category-level score.
art = rank.where(F.col("category") == "art and gifts")
art.orderBy(F.desc("score_category")).limit(10)

merchant_name,category,score_merchant,score_category
Arcu Morbi Institute,art and gifts,1.0163948048627651,1.014985265625849
Dictum Phasellus ...,art and gifts,0.934123188980306,0.9311681803167092
Aliquam Auctor As...,art and gifts,0.9173904597822056,0.9167353464454332
Phasellus At Company,art and gifts,0.9070904242621666,0.9064187387447048
Magna Malesuada C...,art and gifts,0.9080128762170308,0.9042423959190866
Orci In Consequat...,art and gifts,0.8979977583249472,0.8963490396053186
Ultricies Digniss...,art and gifts,0.8794747682068239,0.8768622438420528
Lacus Consulting,art and gifts,0.8628623913307517,0.8621277306564481
Odio Phasellus In...,art and gifts,0.7826305755945498,0.7777688312147572
Ipsum Dolor Sit C...,art and gifts,0.7588789640452228,0.7601443459174498
