In [82]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import *

# Session-level settings applied to the builder below, in this order.
# Eager evaluation is switched on; the cells that end in a bare DataFrame
# expression presumably rely on it for their table rendering.
_spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
}

_builder = SparkSession.builder.appName("ADS project 2")
for _setting_name, _setting_value in _spark_settings.items():
    _builder = _builder.config(_setting_name, _setting_value)

# Single shared session used by every cell that follows.
spark = _builder.getOrCreate()

In [83]:
# Relative path of the directory holding the pre-computed score tables.
data_loc = "../data/scores/"

# Parquet files are self-describing, so the CSV-only "header" reader option
# that used to be set here had no effect and has been removed.

# Per-merchant transaction score. The key column is aliased to "merch" so it
# does not clash with fraud.merchant_name when the two tables are joined.
transact = (
    spark.read.parquet(data_loc + "transactions.parquet")
    .select(F.col("merchant_name").alias("merch"), "score")
)

# Fraud records; only the columns needed for the per-merchant aggregation.
fraud = (
    spark.read.parquet(data_loc + "fraud.parquet")
    .select("merchant_name", "category", "fraud_group")
)

In [84]:
# Preview the transactions table (columns: merch, score). Rendering a bare
# DataFrame as a table presumably relies on the spark.sql.repl.eagerEval.enabled
# setting applied when the session was built.
transact

merch,score
A Aliquet Ltd,1.7056672760511884
A Arcu Industries,4.206581352833638
A Arcu Sed Company,2.1352833638025595
A Arcu Sed Corpor...,0.0329067641681901
A Associates,0.1261425959780621
A Auctor Non Corp...,19.36197440585009
A Auctor Non Inst...,1.323583180987203
A Company,0.0255941499085923
A Consulting,0.2303473491773309
A Corp.,8.186471663619743


In [85]:
# Reduce the fraud records to one row per (merchant_name, category).
# The resulting "fraud" column is 1 minus the mean of fraud_group, so it acts
# as a multiplier: a higher mean fraud_group gives a smaller value.
#
# NOTE: this cell rebinds `fraud`. Re-running it without first re-running the
# loading cell fails, because the fraud_group column no longer exists.
fraud = (
    fraud.groupBy("merchant_name", "category")
    .agg((1 - F.mean("fraud_group")).alias("fraud"))
)
fraud

                                                                                

merchant_name,category,fraud
Risus Odio Auctor...,fashion and acces...,0.993423329871928
Sodales Mauris Co...,fashion and acces...,0.9972972972972972
Accumsan Convalli...,books and music,0.994937563280459
Aliquam Auctor Ve...,outdoors,0.9945553539019965
Lobortis Risus LLP,outdoors,0.9951456310679612
Pellentesque Indu...,outdoors,0.9926739926739928
Vestibulum Mauris...,home and technology,0.6272040302267002
Sit LLP,art and gifts,0.9919743178170144
Luctus Institute,books and music,0.95
Curabitur Massa LLC,outdoors,0.9845559845559846


In [86]:
# Attach each merchant's transaction score to its fraud multiplier and combine
# the two into the final ranking score:
#   transact = raw transaction score
#   score    = fraud * transact
# No join type is given, so Spark's default (inner) applies and merchants
# missing from either table are dropped.
join = (
    fraud.join(transact, on=fraud["merchant_name"] == transact["merch"])
    .drop("merch")
    .withColumnRenamed("score", "transact")
    .withColumn("score", F.col("fraud") * F.col("transact"))
)
join

merchant_name,category,fraud,transact,score
Risus Odio Auctor...,fashion and acces...,0.993423329871928,8.86837294332724,8.810048579906258
Sodales Mauris Co...,fashion and acces...,0.9972972972972972,2.241316270566728,2.2352586590246557
Accumsan Convalli...,books and music,0.994937563280459,17.97074954296161,17.879773760597647
Aliquam Auctor Ve...,outdoors,0.9945553539019965,1.6837294332723949,1.6745621223834344
Lobortis Risus LLP,outdoors,0.9951456310679612,1.2321755027422303,1.2261940682628991
Pellentesque Indu...,outdoors,0.9926739926739928,0.4149908592321755,0.4119506331572145
Vestibulum Mauris...,home and technology,0.6272040302267002,0.5923217550274223,0.3715065919441515
Sit LLP,art and gifts,0.9919743178170144,1.9378427787934187,1.922290268530229
Luctus Institute,books and music,0.95,0.0603290676416819,0.0573126142595978
Curabitur Massa LLC,outdoors,0.9845559845559846,0.4076782449725777,0.4013820558610321


In [87]:
# Distinct category labels present in the joined table, as a plain Python list.
# The row order returned by distinct() is not guaranteed.
cat = [row["category"] for row in join.select("category").distinct().collect()]
cat

['outdoors',
 'home and technology',
 'fashion and accessories',
 'books and music',
 'art and gifts']

In [88]:
# Top 10 "outdoors" merchants by combined score, highest first.
outdoors = join.where(F.col("category") == "outdoors")
outdoors.orderBy(F.col("score").desc()).limit(10)

merchant_name,category,fraud,transact,score
Erat Vitae LLP,outdoors,0.9941454012904296,440.54478976234,437.9655768046894
Pede Nonummy Corp.,outdoors,0.9944359443364356,396.8976234003656,394.68926293102953
Non Vestibulum In...,outdoors,0.994485427793444,377.34917733089577,375.26825804542
Est Nunc Consulting,outdoors,0.9941842697461636,328.6563071297989,326.74493070131
Lorem Ipsum Sodal...,outdoors,0.994286140905358,306.15722120658137,304.4078819837998
Euismod Urna Inst...,outdoors,0.994486631357284,119.21206581352834,118.55480574803865
Rhoncus Donec Ass...,outdoors,0.9948346955845452,103.43144424131629,102.89718934567972
Dui Nec Corporation,outdoors,0.994019315396202,103.3473491773309,102.7292612772627
Porttitor Tellus ...,outdoors,0.9942621240573488,102.0292504570384,101.44381927539422
Eget Metus In Cor...,outdoors,0.994040853939487,99.28153564899452,98.68990247695014


In [89]:
# Top 10 "home and technology" merchants by combined score, highest first.
# The limit(10) was missing here although every other category cell applies
# it; added so all five category rankings are cut off at the same length.
home = join.filter(F.col("category") == "home and technology")
home.sort(F.col("score").desc()).limit(10)

merchant_name,category,fraud,transact,score
Placerat Eget Ven...,home and technology,0.9942652052249124,275.57404021937845,273.99367965337854
Mauris Non Institute,home and technology,0.9941721173534946,183.78427787934183,182.71320467558832
Interdum Feugiat ...,home and technology,0.9925698625907056,78.96709323583181,78.38035688227701
Vel Est Tempor LLP,home and technology,0.994416400535696,73.92870201096892,73.51591375002377
Eget Laoreet Posu...,home and technology,0.9935382121656846,69.56672760511883,69.11720217100694
Phasellus At Limited,home and technology,0.9938053503418528,66.22120658135283,65.81098940664157
Libero Et Limited,home and technology,0.9933189046767668,64.81901279707495,64.38595079381982
Amet Consulting,home and technology,0.9939879759519038,64.53016453382084,64.1422076328159
Arcu Sed Eu Incor...,home and technology,0.9935764117414768,58.29798903107861,57.92350675324307
Eleifend PC,home and technology,0.9941151408638228,57.44424131627056,57.10619004793974


In [90]:
# Top 10 "fashion and accessories" merchants by combined score, highest first.
fashion = join.where(F.col("category") == "fashion and accessories")
fashion.orderBy(F.col("score").desc()).limit(10)

merchant_name,category,fraud,transact,score
Leo In Consulting,fashion and acces...,0.9941873044862274,415.47897623400365,413.0639234527814
Suspendisse Dui C...,fashion and acces...,0.9944462217672276,356.89945155393053,354.9173111486019
Dignissim Maecena...,fashion and acces...,0.9940244141065973,101.54844606946985,100.94163460764014
Gravida Mauris In...,fashion and acces...,0.9934301787916152,79.22303473491773,78.70255356112368
Euismod Et Institute,fashion and acces...,0.9934775299787068,68.25045703839122,67.80529547841874
Tellus Id LLC,fashion and acces...,0.9938012352610892,67.38574040219379,66.96803205068328
Ultricies Sem Lim...,fashion and acces...,0.994281753595564,61.76782449725777,61.41462085691649
Iaculis LLC,fashion and acces...,0.9937021362353888,60.57221206581353,60.19073652630191
Euismod Enim LLC,fashion and acces...,0.9944288929139908,55.35100548446069,55.042639105588485
Ipsum Primis Asso...,fashion and acces...,0.9944759906184422,49.50091407678245,49.22747056302661


In [91]:
# Top 10 "books and music" merchants by combined score, highest first.
books = join.where(F.col("category") == "books and music")
books.orderBy(F.col("score").desc()).limit(10)

merchant_name,category,fraud,transact,score
Lobortis Ultrices...,books and music,0.9935354887575631,172.9908592321755,171.8725578778303
Nullam Consulting,books and music,0.9942725781257676,155.42413162705668,154.5339520557923
Suspendisse Ac As...,books and music,0.9942116060961314,104.06398537477148,103.46162203621589
Ornare Fusce Inc.,books and music,0.9941222785783604,83.27056672760511,82.7811255337582
Eros Limited,books and music,0.9934677700508266,83.308957952468,82.76476468229647
Euismod In Corp.,books and music,0.993442987330518,68.29433272394881,67.84652591902406
Feugiat Sed Nec I...,books and music,0.993625994440717,63.48628884826326,63.08162689020618
Ac Eleifend Corp.,books and music,0.9935133194882444,59.3418647166362,58.95693299924756
Vitae Odio Limited,books and music,0.9935686015831134,55.3382084095064,54.98230634354816
Et Nunc Consulting,books and music,0.9932967702620352,52.49725776965265,52.14535659020952


In [92]:
# Top 10 "art and gifts" merchants by combined score, highest first.
art = join.where(F.col("category") == "art and gifts")
art.orderBy(F.col("score").desc()).limit(10)

merchant_name,category,fraud,transact,score
Lacus Consulting,art and gifts,0.994663956295261,347.9232175502742,346.06668405553256
Ipsum Dolor Sit C...,art and gifts,0.9947724039829304,299.87568555758685,298.30805661815
Vehicula Pellente...,art and gifts,0.9942653236270866,277.2870201096892,275.69686878695063
Dictum Phasellus ...,art and gifts,0.9942123094297008,151.20475319926874,150.3296268749929
Phasellus At Company,art and gifts,0.9938221837018524,151.09140767824496,150.15799271738024
Ultricies Digniss...,art and gifts,0.9941348973607038,138.3473491773309,137.5359277745313
Orci In Consequat...,art and gifts,0.9943694481359726,130.80073126142597,130.0642509602058
Faucibus Leo Ltd,art and gifts,0.9940760406554,76.77879341864717,76.32395896790766
Tristique Pellent...,art and gifts,0.993088071348941,54.60511882998172,54.22769214464629
Id Erat Etiam Con...,art and gifts,0.9939752315073078,54.40585009140768,54.07806743995883
