In [1]:
import sys
sys.path.append('../scripts/')
from read_utils import read_file, create_folder, temp_record_query, temp_record_sdf

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [3]:
spark = (
    # Create a spark session (which will run spark jobs)
    SparkSession.builder.appName("Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config('spark.executor.memory','10g')
    .config('spark.driver.memory','12g')
    .config('spark.driver.maxResultsSize', '10GiB')
    # .config("spark.network.timeout", "3600s")
    # .master("local[6]")
    .getOrCreate()
    )

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/08 22:28:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
consumer_fraud_sdf = read_file(spark, 'consumer_fraud.parquet', '../data/curated/')

|> Loading File...


                                                                                

|> Loading Finished!
-RECORD 0---------------------------------------------------------------------------------------------
 user_id           | 9010                                                                             
 consumer_id       | 1053884                                                                          
 take_rate         | 5.73                                                                             
 tagIndex          | 0.0                                                                              
 tag_vec           | (24,[0],[1.0])                                                                   
 count             | 1                                                                                
 dollar_average    | 2.2484627193298805                                                               
 dollar_min        | 2.2484627193298805                                                               
 dollar_max        | 2.2484627193298805             

In [5]:
merchant_fraud_sdf = read_file(spark, 'merchant_fraud.parquet', '../data/curated/')

|> Loading File...
|> Loading Finished!
-RECORD 0---------------------------------------------------------------------------------------------
 merchant_abn      | 43719937438                                                                      
 take_rate         | 5.01                                                                             
 tagIndex          | 4.0                                                                              
 tag_vec           | (24,[4],[1.0])                                                                   
 count             | 11                                                                               
 dollar_average    | 76.74733459026552                                                                
 dollar_min        | 27.621111402431843                                                               
 dollar_max        | 167.7734627758464                                                                
 order_year        | 2021        

In [6]:
trans_df = read_file(spark, 'changed_data', '../data/curated/')

|> Loading File...
|> Loading Finished!
22/10/08 22:29:15 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
-RECORD 0---------------------------------------------------------------------------------------------
 POA_CODE21        | 2040                                                                             
 user_id           | 11893                                                                            
 merchant_abn      | 90578415511                                                                      
 dollar_value      | 131.14483377730215                                                               
 order_id          | 1db43d10-b5f2-4afc-b94a-948cfa1879f1                                             
 merchant_name     | A Scelerisque Foundation                                                         
 tags              | furniture, home furnishings and equipm

In [7]:
from pyspark.sql.functions import year, month, dayofmonth

In [8]:
trans_df = (trans_df
               .withColumn('order_year', year(trans_df.order_datetime))
               .withColumn('order_month', month(trans_df.order_datetime))
               .withColumn('order_day', dayofmonth(trans_df.order_datetime))
               ).drop('order_datetime')

In [9]:
trans_df.show(5)

+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+----------------+-------------------+-----+------+-----------------+---------------+----------+------------+-----------------+------------------+------------------+-----------+---------+--------+---------------+----------+-----------+---------+
|POA_CODE21|user_id|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|            name|            address|state|gender|       population|New cases / day|     month|total_retail|__index_level_0__|          latitude|         longitude|genderIndex|typeIndex|tagIndex|        tag_vec|order_year|order_month|order_day|
+----------+-------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+----------------+-------------------+-----+------+-------

### Three categories based on the average value of orders

- Merchant Classification

Classification of shops according to their average sales level

In [10]:
from pyspark.ml.feature import QuantileDiscretizer

In [11]:
dqis = QuantileDiscretizer(numBuckets=3, inputCol='dollar_average', outputCol='dollar_stage', relativeError=0.001)

In [12]:
model = dqis.fit(merchant_fraud_sdf)

In [13]:
merchant_fraud_sdf = model.transform(merchant_fraud_sdf)

In [14]:
merchant_fraud_sdf.show()

+------------+---------+--------+---------------+-----+------------------+------------------+------------------+----------+-----------+---------+--------------------+------------------+------------+
|merchant_abn|take_rate|tagIndex|        tag_vec|count|    dollar_average|        dollar_min|        dollar_max|order_year|order_month|order_day|            features| fraud_probability|dollar_stage|
+------------+---------+--------+---------------+-----+------------------+------------------+------------------+----------+-----------+---------+--------------------+------------------+------------+
| 43719937438|     5.01|     4.0| (24,[4],[1.0])|   11| 76.74733459026552|27.621111402431843| 167.7734627758464|      2021|         11|       26|(34,[0,1,2,7,27,2...| 93.31059770507272|         0.0|
| 72553304202|     5.04|     4.0| (24,[4],[1.0])|   30|53.323260095837426| 11.07002991392789|146.32500496249799|      2021|         11|       26|(34,[0,1,2,7,27,2...| 88.55146900735417|         0.0|
| 286

- Four categories of merchants based on number of purchases

In [15]:
dqis2 = QuantileDiscretizer(numBuckets=4, inputCol='count', outputCol='count_stage', relativeError=0.001)

In [16]:
model2 = dqis2.fit(merchant_fraud_sdf)

In [18]:
merchant_fraud_sdf = model2.transform(merchant_fraud_sdf)

In [19]:
merchant_fraud_sdf.show()

+------------+---------+--------+---------------+-----+------------------+------------------+------------------+----------+-----------+---------+--------------------+------------------+------------+-----------+
|merchant_abn|take_rate|tagIndex|        tag_vec|count|    dollar_average|        dollar_min|        dollar_max|order_year|order_month|order_day|            features| fraud_probability|dollar_stage|count_stage|
+------------+---------+--------+---------------+-----+------------------+------------------+------------------+----------+-----------+---------+--------------------+------------------+------------+-----------+
| 43719937438|     5.01|     4.0| (24,[4],[1.0])|   11| 76.74733459026552|27.621111402431843| 167.7734627758464|      2021|         11|       26|(34,[0,1,2,7,27,2...| 93.31059770507272|         0.0|        3.0|
| 72553304202|     5.04|     4.0| (24,[4],[1.0])|   30|53.323260095837426| 11.07002991392789|146.32500496249799|      2021|         11|       26|(34,[0,1,2,

- Join with transaction data

In [20]:
consumer_fraud = consumer_fraud_sdf.select('user_id', 'order_year', 'order_month', 'order_day', 'fraud_probability')

In [21]:
consumer_fraud = consumer_fraud.withColumnRenamed('fraud_probability', 'consumer_fraud_probability')

In [22]:
consumer_fraud.show(5)

+-------+----------+-----------+---------+--------------------------+
|user_id|order_year|order_month|order_day|consumer_fraud_probability|
+-------+----------+-----------+---------+--------------------------+
|   9010|      2021|         11|       26|         11.53696660751666|
|   8942|      2021|         11|       26|         11.56013830129541|
|  11795|      2021|         11|       26|        13.605123123049225|
|  10548|      2021|         11|       26|         13.67580908072864|
|  10120|      2021|         11|       26|        13.930895430112514|
+-------+----------+-----------+---------+--------------------------+
only showing top 5 rows



In [23]:
sdf = trans_df.join(consumer_fraud, on=['user_id', 'order_year', 'order_month', 'order_day'], how = 'left')

In [24]:
sdf.show()

[Stage 17:>                                                         (0 + 8) / 9]

+-------+----------+-----------+---------+----------+------------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+----------------+--------------------+-----+------+-----------------+---------------+----------+------------+-----------------+------------------+-------------------+-----------+---------+--------+---------------+--------------------------+
|user_id|order_year|order_month|order_day|POA_CODE21|merchant_abn|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|            name|             address|state|gender|       population|New cases / day|     month|total_retail|__index_level_0__|          latitude|          longitude|genderIndex|typeIndex|tagIndex|        tag_vec|consumer_fraud_probability|
+-------+----------+-----------+---------+----------+------------+------------------+--------------------+--------------------+--------------------+--

                                                                                

In [25]:
merchant_fraud = merchant_fraud_sdf.select('merchant_abn', 'order_year', 'order_month', 'order_day', 'dollar_stage','count_stage','count','fraud_probability')

In [26]:
merchant_fraud = merchant_fraud.withColumnRenamed('fraud_probability', 'merchant_fraud_probability')

In [27]:
merchant_fraud.show(5)

+------------+----------+-----------+---------+------------+-----------+-----+--------------------------+
|merchant_abn|order_year|order_month|order_day|dollar_stage|count_stage|count|merchant_fraud_probability|
+------------+----------+-----------+---------+------------+-----------+-----+--------------------------+
| 43719937438|      2021|         11|       26|         0.0|        3.0|   11|         93.31059770507272|
| 72553304202|      2021|         11|       26|         0.0|        3.0|   30|         88.55146900735417|
| 28690231799|      2021|         11|       26|         1.0|        3.0|   24|         50.78452165739873|
| 45785438987|      2021|         11|       26|         2.0|        2.0|    4|         44.42391595533991|
| 41485392864|      2021|         11|       26|         1.0|        2.0|    3|         61.59168677152047|
+------------+----------+-----------+---------+------------+-----------+-----+--------------------------+
only showing top 5 rows



In [28]:
sdf = sdf.join(merchant_fraud, on=['merchant_abn', 'order_year', 'order_month', 'order_day'], how = 'left')

In [29]:
sdf.show()



+------------+----------+-----------+---------+-------+----------+------------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+----------------+--------------------+-----+------+-----------------+---------------+----------+------------+-----------------+------------------+-------------------+-----------+---------+--------+---------------+--------------------------+------------+-----------+-----+--------------------------+
|merchant_abn|order_year|order_month|order_day|user_id|POA_CODE21|      dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|            name|             address|state|gender|       population|New cases / day|     month|total_retail|__index_level_0__|          latitude|          longitude|genderIndex|typeIndex|tagIndex|        tag_vec|consumer_fraud_probability|dollar_stage|count_stage|count|merchant_fraud_probability|
+------------+----------+---------

                                                                                

- Rank Model

In [30]:
sdf = sdf.withColumn('average_probability', ((sdf.consumer_fraud_probability + sdf.merchant_fraud_probability)/2))

In [30]:
# sdf = sdf.filter(sdf.average_probability <= 80)

In [31]:
sdf.show(5)

[Stage 43:>                                                         (0 + 8) / 9]

+------------+----------+-----------+---------+-------+----------+-----------------+--------------------+--------------------+--------------------+---------+----+--------+-----------+---------------+--------------------+-----+------+-----------------+---------------+----------+------------+-----------------+------------------+------------------+-----------+---------+--------+--------------+--------------------------+------------+-----------+-----+--------------------------+-------------------+
|merchant_abn|order_year|order_month|order_day|user_id|POA_CODE21|     dollar_value|            order_id|       merchant_name|                tags|take_rate|type|postcode|consumer_id|           name|             address|state|gender|       population|New cases / day|     month|total_retail|__index_level_0__|          latitude|         longitude|genderIndex|typeIndex|tagIndex|       tag_vec|consumer_fraud_probability|dollar_stage|count_stage|count|merchant_fraud_probability|average_probability|
+-

                                                                                

- Top10 by dollar_stage

In [32]:
merchant_group = sdf.groupBy('merchant_abn','merchant_name', 'tags', 'take_rate', 'type', 'dollar_stage', 'count_stage').agg(F.avg("merchant_fraud_probability").alias("fraud_on_mercants"))

In [33]:
trans_sdf = read_file(spark, 'changed_data', '../data/curated/')

|> Loading File...
|> Loading Finished!
-RECORD 0---------------------------------------------------------------------------------------------
 POA_CODE21        | 2040                                                                             
 user_id           | 11893                                                                            
 merchant_abn      | 90578415511                                                                      
 dollar_value      | 131.14483377730215                                                               
 order_id          | 1db43d10-b5f2-4afc-b94a-948cfa1879f1                                             
 merchant_name     | A Scelerisque Foundation                                                         
 tags              | furniture, home furnishings and equipment shops, and manufacturers, except ap... 
 take_rate         | 5.95                                                                             
 type              | a           

In [35]:
merchant_group

[Stage 62:>                                                         (0 + 8) / 9]

22/10/08 22:33:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:33:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:33:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:33:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:33:32 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:33:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:33:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:33:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 62:>                                                         (0 + 8) / 9]

22/10/08 22:34:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:34:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:34:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [34]:
tag1 = merchant_group.filter(merchant_group.dollar_stage == 0.0).orderBy('fraud_on_merchants', ascending=False)

AnalysisException: Column 'fraud_on_merchants' does not exist. Did you mean one of the following? [fraud_on_mercants, merchant_abn, take_rate, merchant_name, count_stage, dollar_stage, tags, type];
'Sort ['fraud_on_merchants DESC NULLS LAST], true
+- Filter (dollar_stage#680 = 0.0)
   +- Aggregate [merchant_abn#195L, merchant_name#198, tags#199, take_rate#200, type#201, dollar_stage#680, count_stage#873], [merchant_abn#195L, merchant_name#198, tags#199, take_rate#200, type#201, dollar_stage#680, count_stage#873, avg(merchant_fraud_probability#1454) AS fraud_on_mercants#2474]
      +- Project [merchant_abn#195L, order_year#384, order_month#413, order_day#443, user_id#194L, POA_CODE21#193L, dollar_value#196, order_id#197, merchant_name#198, tags#199, take_rate#200, type#201, postcode#202, consumer_id#203, name#204, address#205, state#206, gender#207, population#208, New cases / day#209, month#210, total_retail#211, __index_level_0__#212L, latitude#213, ... 11 more fields]
         +- Project [merchant_abn#195L, order_year#384, order_month#413, order_day#443, user_id#194L, POA_CODE21#193L, dollar_value#196, order_id#197, merchant_name#198, tags#199, take_rate#200, type#201, postcode#202, consumer_id#203, name#204, address#205, state#206, gender#207, population#208, New cases / day#209, month#210, total_retail#211, __index_level_0__#212L, latitude#213, ... 10 more fields]
            +- Join LeftOuter, ((((merchant_abn#195L = merchant_abn#100L) AND (order_year#384 = order_year#108)) AND (order_month#413 = order_month#109)) AND (order_day#443 = order_day#110))
               :- Project [user_id#194L, order_year#384, order_month#413, order_day#443, POA_CODE21#193L, merchant_abn#195L, dollar_value#196, order_id#197, merchant_name#198, tags#199, take_rate#200, type#201, postcode#202, consumer_id#203, name#204, address#205, state#206, gender#207, population#208, New cases / day#209, month#210, total_retail#211, __index_level_0__#212L, latitude#213, ... 6 more fields]
               :  +- Join LeftOuter, ((((user_id#194L = user_id#0L) AND (order_year#384 = order_year#9)) AND (order_month#413 = order_month#10)) AND (order_day#443 = order_day#11))
               :     :- Project [POA_CODE21#193L, user_id#194L, merchant_abn#195L, dollar_value#196, order_id#197, merchant_name#198, tags#199, take_rate#200, type#201, postcode#202, consumer_id#203, name#204, address#205, state#206, gender#207, population#208, New cases / day#209, month#210, total_retail#211, __index_level_0__#212L, latitude#213, longitude#214, genderIndex#215, typeIndex#216, ... 5 more fields]
               :     :  +- Project [POA_CODE21#193L, user_id#194L, merchant_abn#195L, dollar_value#196, order_id#197, merchant_name#198, tags#199, take_rate#200, type#201, postcode#202, consumer_id#203, name#204, address#205, state#206, gender#207, population#208, New cases / day#209, month#210, total_retail#211, __index_level_0__#212L, latitude#213, longitude#214, genderIndex#215, typeIndex#216, ... 6 more fields]
               :     :     +- Project [POA_CODE21#193L, user_id#194L, merchant_abn#195L, dollar_value#196, order_id#197, merchant_name#198, tags#199, take_rate#200, type#201, postcode#202, consumer_id#203, name#204, address#205, state#206, gender#207, population#208, New cases / day#209, month#210, total_retail#211, __index_level_0__#212L, latitude#213, longitude#214, genderIndex#215, typeIndex#216, ... 5 more fields]
               :     :        +- Project [POA_CODE21#193L, user_id#194L, merchant_abn#195L, dollar_value#196, order_id#197, merchant_name#198, tags#199, take_rate#200, type#201, postcode#202, consumer_id#203, name#204, address#205, state#206, gender#207, population#208, New cases / day#209, month#210, total_retail#211, __index_level_0__#212L, latitude#213, longitude#214, genderIndex#215, typeIndex#216, ... 4 more fields]
               :     :           +- Relation [POA_CODE21#193L,user_id#194L,merchant_abn#195L,dollar_value#196,order_id#197,merchant_name#198,tags#199,take_rate#200,type#201,postcode#202,consumer_id#203,name#204,address#205,state#206,gender#207,population#208,New cases / day#209,month#210,total_retail#211,__index_level_0__#212L,latitude#213,longitude#214,genderIndex#215,typeIndex#216,... 3 more fields] parquet
               :     +- Project [user_id#0L, order_year#9, order_month#10, order_day#11, fraud_probability#13 AS consumer_fraud_probability#971]
               :        +- Project [user_id#0L, order_year#9, order_month#10, order_day#11, fraud_probability#13]
               :           +- Relation [user_id#0L,consumer_id#1,take_rate#2,tagIndex#3,tag_vec#4,count#5L,dollar_average#6,dollar_min#7,dollar_max#8,order_year#9,order_month#10,order_day#11,features#12,fraud_probability#13] parquet
               +- Project [merchant_abn#100L, order_year#108, order_month#109, order_day#110, dollar_stage#680, count_stage#873, count#104L, fraud_probability#112 AS merchant_fraud_probability#1454]
                  +- Project [merchant_abn#100L, order_year#108, order_month#109, order_day#110, dollar_stage#680, count_stage#873, count#104L, fraud_probability#112]
                     +- Project [merchant_abn#100L, take_rate#101, tagIndex#102, tag_vec#103, count#104L, dollar_average#105, dollar_min#106, dollar_max#107, order_year#108, order_month#109, order_day#110, features#111, fraud_probability#112, dollar_stage#680, if (isnull(cast(count#104L as double))) null else bucketizer_0(knownnotnull(cast(count#104L as double))) AS count_stage#873]
                        +- Project [merchant_abn#100L, take_rate#101, tagIndex#102, tag_vec#103, count#104L, dollar_average#105, dollar_min#106, dollar_max#107, order_year#108, order_month#109, order_day#110, features#111, fraud_probability#112, if (isnull(cast(dollar_average#105 as double))) null else bucketizer_0(knownnotnull(cast(dollar_average#105 as double))) AS dollar_stage#680]
                           +- Relation [merchant_abn#100L,take_rate#101,tagIndex#102,tag_vec#103,count#104L,dollar_average#105,dollar_min#106,dollar_max#107,order_year#108,order_month#109,order_day#110,features#111,fraud_probability#112] parquet


In [None]:
tag1.show(10)



+------------+--------------------+--------------------+---------+----+------------+-----------+------------------+
|merchant_abn|       merchant_name|                tags|take_rate|type|dollar_stage|count_stage| fraud_on_mercants|
+------------+--------------------+--------------------+---------+----+------------+-----------+------------------+
| 17294075449|In Faucibus Incor...|cable, satellite,...|     0.67|   d|         0.0|        1.0|105.41331838258702|
| 31322181845| Magna Sed Institute|cable, satellite,...|     0.31|   e|         0.0|        2.0|105.32404615904191|
| 31322181845| Magna Sed Institute|cable, satellite,...|     0.31|   e|         0.0|        1.0|104.89861440023932|
| 19237425345|A Scelerisque Ass...|cable, satellite,...|     2.04|   c|         0.0|        2.0|104.65614738864649|
| 21793603759|In Tincidunt Cong...|cable, satellite,...|     1.83|   c|         0.0|        2.0|103.37762567989265|
| 21793603759|In Tincidunt Cong...|cable, satellite,...|     1.83|   c| 

                                                                                

In [None]:
tag2 = merchant_group.filter(merchant_group.dollar_stage == 1.0).orderBy('fraud_on_mercants', ascending=False)

In [None]:
tag2.show(10)



+------------+--------------------+--------------------+---------+----+------------+-----------+------------------+
|merchant_abn|       merchant_name|                tags|take_rate|type|dollar_stage|count_stage| fraud_on_mercants|
+------------+--------------------+--------------------+---------+----+------------+-----------+------------------+
| 10142254217|Arcu Ac Orci Corp...|cable, satellite,...|     4.22|   b|         1.0|        1.0|106.67822921258357|
| 31322181845| Magna Sed Institute|cable, satellite,...|     0.31|   e|         1.0|        3.0|105.35905181181576|
| 31322181845| Magna Sed Institute|cable, satellite,...|     0.31|   e|         1.0|        1.0|105.30183328119115|
| 17294075449|In Faucibus Incor...|cable, satellite,...|     0.67|   d|         1.0|        1.0|105.28542846254946|
| 31322181845| Magna Sed Institute|cable, satellite,...|     0.31|   e|         1.0|        2.0| 105.0386532895941|
| 50885695263|   Euismod Institute|cable, satellite,...|     2.84|   c| 

                                                                                

In [None]:
tag3 = merchant_group.filter(merchant_group.dollar_stage == 2.0).orderBy('fraud_on_mercants', ascending=False)

In [None]:
tag3.show(10)



+------------+--------------------+--------------------+---------+----+------------+-----------+------------------+
|merchant_abn|       merchant_name|                tags|take_rate|type|dollar_stage|count_stage| fraud_on_mercants|
+------------+--------------------+--------------------+---------+----+------------+-----------+------------------+
| 17294075449|In Faucibus Incor...|cable, satellite,...|     0.67|   d|         2.0|        1.0|105.60961301636416|
| 21793603759|In Tincidunt Cong...|cable, satellite,...|     1.83|   c|         2.0|        1.0|104.42588236083975|
| 26974881830|Tellus Id Nunc As...|cable, satellite,...|     6.03|   a|         2.0|        1.0|101.03980424832844|
| 41974958954|Sed Libero Proin ...|cable, satellite,...|     5.51|   a|         2.0|        3.0|100.78551034236443|
| 28124275236|Vehicula Risus In...|cable, satellite,...|     2.69|   c|         2.0|        1.0|100.73686347749754|
| 67609108741|Metus Sit Amet In...|cable, satellite,...|     0.38|   e| 

                                                                                

## Three categories based on life cycle

In [37]:
merchant_fraud_sdf

                                                                                

merchant_abn,take_rate,tagIndex,tag_vec,count,dollar_average,dollar_min,dollar_max,order_year,order_month,order_day,features,fraud_probability,dollar_stage,count_stage
43719937438,5.01,4.0,"(24,[4],[1.0])",11,76.74733459026552,27.621111402431843,167.7734627758464,2021,11,26,"(34,[0,1,2,7,27,2...",93.31059770507272,0.0,3.0
72553304202,5.04,4.0,"(24,[4],[1.0])",30,53.32326009583743,11.07002991392789,146.325004962498,2021,11,26,"(34,[0,1,2,7,27,2...",88.55146900735417,0.0,3.0
28690231799,4.31,10.0,"(24,[10],[1.0])",24,251.4879766401084,72.67931938605861,572.2299960197754,2021,11,26,"(34,[0,1,2,13,27,...",50.78452165739873,1.0,3.0
45785438987,4.95,0.0,"(24,[0],[1.0])",4,559.1587888969477,11.844579423870604,1395.367630362804,2021,11,26,"(34,[0,1,3,27,28,...",44.42391595533991,2.0,2.0
41485392864,4.74,5.0,"(24,[5],[1.0])",3,267.7757428151986,216.53958541358767,323.0162848532207,2021,11,26,"(34,[0,1,2,8,27,2...",61.59168677152047,1.0,2.0
64435268766,2.89,18.0,"(24,[18],[1.0])",1,896.6366960482907,896.6366960482907,896.6366960482907,2021,11,26,"(34,[0,1,2,21,27,...",49.59054877724702,2.0,1.0
96885194635,3.8,13.0,"(24,[13],[1.0])",2,2277.3445783894954,898.7182159051191,3655.9709408738713,2021,11,26,"(34,[0,1,2,16,27,...",37.93870840824093,2.0,1.0
35883675055,2.3,14.0,"(24,[14],[1.0])",3,44.64095083441862,16.17910486324135,72.66649955345133,2021,11,26,"(34,[0,1,2,17,27,...",39.82450035140937,0.0,2.0
66527348775,2.34,5.0,"(24,[5],[1.0])",1,126.0920502701329,126.0920502701329,126.0920502701329,2021,11,26,"(34,[0,1,2,8,27,2...",63.960047859553015,1.0,1.0
67920961191,1.66,22.0,"(24,[22],[1.0])",1,1329.3659634700696,1329.3659634700696,1329.3659634700696,2021,11,26,"(34,[0,1,2,25,27,...",38.2363400551767,2.0,1.0


In [38]:
No_clear_periodicity = ['antique shops - sales, repairs, and restoration services',
                         'bicycle shops - sales and service',
                         'equipment, tool, furniture, and appliance rent al and leasing',
                         'watch, clock, and jewelry repair shops',
                         'gift, card, novelty, and souvenir shops',
                         'jewelry, watch, clock, and silverware shops',
                         'shoe shops',
                         'health and beauty spas',
                         'hobby, toy and game shops',
                         'digital goods: books, movies, music'
                         ]

In [39]:
len(No_clear_periodicity)

10

In [40]:
once_used = [
    'art dealers and galleries',
    'artist supply and craft shops',
    'computer programming , data processing, and integrated systems design services',
    'computers, computer peripheral equipment, and software',
    'motor vehicle supplies and new parts',
    'furniture, home furnishings and equipment shops, and manufacturers, except appliances',
    'music shops - musical instruments, pianos, and sheet music',
    'tent and awning shops'
]

In [41]:
len(once_used)

8

In [42]:
periodicity_product = [
    'stationery, office supplies and printing and writing paper',
    'telecom',
    'cable, satellite, and other pay television and radio services',
    'florists supplies, nursery stock, and flowers',
    'lawn and garden supply outlets, including nurseries',
    'books, periodicals, and newspapers',
    'opticians, optical goods, and eyeglasses',
]

In [43]:
len(periodicity_product)

7

In [44]:
merchant_group.show(5)

[Stage 84:>                 (0 + 8) / 9][Stage 87:>                 (0 + 0) / 9]

22/10/08 22:25:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:25:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:25:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:25:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:25:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:25:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:25:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/10/08 22:25:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


[Stage 84:>                 (0 + 8) / 9][Stage 87:>                 (0 + 0) / 9]

22/10/08 22:26:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
merchant_group.count()

                                                                                

3998

In [None]:
merchant_group = sdf.groupBy('merchant_abn','merchant_name', 'tags', 'take_rate', 'type').agg(F.avg("merchant_fraud_probability").alias("fraud_on_mercants"))
merchant_group.count()

                                                                                

3998

In [None]:
merchant_group.show(5)



+------------+--------------------+--------------------+---------+----+------------------+
|merchant_abn|       merchant_name|                tags|take_rate|type| fraud_on_mercants|
+------------+--------------------+--------------------+---------+----+------------------+
| 29616684420|       Tellus Id LLC|watch, clock, and...|      3.5|   b| 44.26282853979993|
| 46352584904|Vitae Aliquet Ass...|digital goods: bo...|     1.78|   c| 54.10562795632487|
| 19054547079|Tincidunt Adipisc...|gift, card, novel...|     6.65|   a| 81.05020124840995|
| 81299585288|     Eu Dui Cum Inc.|digital goods: bo...|     3.61|   b|50.602334303724255|
| 64521628277|   Augue Sed Limited|bicycle shops - s...|     6.15|   a|44.730533762670895|
+------------+--------------------+--------------------+---------+----+------------------+
only showing top 5 rows



                                                                                

In [None]:
merchant_group_df = merchant_group.to_pandas_on_spark()



In [None]:
from collections import defaultdict
lifecycle_count = defaultdict()
for type in No_clear_periodicity:
    lifecycle_count[type] = 1
    
for type in once_used:
    lifecycle_count[type] = 2

for type in periodicity_product:
    lifecycle_count[type] = 0


In [None]:
merchant_group_df.iloc[:5]

                                                                                

Unnamed: 0,merchant_abn,merchant_name,tags,take_rate,type,fraud_on_mercants
0,29616684420,Tellus Id LLC,"watch, clock, and jewelry repair shops",3.5,b,44.262829
1,46352584904,Vitae Aliquet Associates,"digital goods: books, movies, music",1.78,c,54.105628
2,19054547079,Tincidunt Adipiscing Corp.,"gift, card, novelty, and souvenir shops",6.65,a,81.050201
3,81299585288,Eu Dui Cum Inc.,"digital goods: books, movies, music",3.61,b,50.602334
4,64521628277,Augue Sed Limited,bicycle shops - sales and service,6.15,a,44.730534


In [None]:
merchant_group_df['lifecycle_count'] = merchant_group_df.tags.apply(lambda x: lifecycle_count[x] )

                                                                                

In [None]:
merchant_group = merchant_group_df.to_spark()



In [None]:
tag1 = merchant_group.filter(merchant_group.lifecycle_count == 0).orderBy('fraud_on_mercants', ascending=False)

In [None]:
from pyspark.sql.window import Window
fraud_on_mercants = Window.partitionBy("lifecycle_count").orderBy("fraud_on_mercants")
rank = merchant_group.withColumn("rank",F.rank().over(fraud_on_mercants))

In [None]:
rank.show(10)

                                                                                

+------------+--------------------+--------------------+---------+----+------------------+---------------+----+
|merchant_abn|       merchant_name|                tags|take_rate|type| fraud_on_mercants|lifecycle_count|rank|
+------------+--------------------+--------------------+---------+----+------------------+---------------+----+
| 24852446429|      Erat Vitae LLP|florists supplies...|     2.94|   c|-7.267578198286914|              0|   1|
| 43186523025|Lorem Ipsum Sodal...|florists supplies...|     4.47|   b| 8.045411768246208|              0|   2|
| 46804135891|Suspendisse Dui C...|opticians, optica...|     2.93|   c| 10.24788927405117|              0|   3|
| 97089682451| Mus Aenean Eget LLC|             telecom|     6.83|   a| 27.34329330999805|              0|   4|
| 91426391836|      Varius Limited|             telecom|     6.58|   a|29.334257325282426|              0|   5|
| 96806981644|      In Corporation|             telecom|     6.23|   a| 29.61290125832602|              

In [None]:
rank.filter(F.col('lifecycle_count')==0)

                                                                                

merchant_abn,merchant_name,tags,take_rate,type,fraud_on_mercants,lifecycle_count,rank
24852446429,Erat Vitae LLP,florists supplies...,2.94,c,-7.267578198286914,0,1
43186523025,Lorem Ipsum Sodal...,florists supplies...,4.47,b,8.045411768246208,0,2
46804135891,Suspendisse Dui C...,"opticians, optica...",2.93,c,10.24788927405117,0,3
97089682451,Mus Aenean Eget LLC,telecom,6.83,a,27.34329330999805,0,4
91426391836,Varius Limited,telecom,6.58,a,29.334257325282422,0,5
96806981644,In Corporation,telecom,6.23,a,29.61290125832602,0,6
97860823526,Aliquet Metus Urn...,telecom,5.85,a,29.97564968883061,0,7
82368304209,Nec Incorporated,telecom,5.55,a,30.474856348953647,0,8
72897360319,Sapien Imperdiet ...,telecom,6.28,a,30.550482234486903,0,9
42355028515,Eu Inc.,lawn and garden s...,5.97,a,31.238013092503863,0,10


In [None]:
rank.filter(F.col('lifecycle_count')==1)

                                                                                

merchant_abn,merchant_name,tags,take_rate,type,fraud_on_mercants,lifecycle_count,rank
86578477987,Leo In Consulting,"watch, clock, and...",6.43,a,-46.73426275011947,1,1
45629217853,Lacus Consulting,"gift, card, novel...",6.98,a,9.204568935170464,1,2
98973094975,Ornare Fusce Inc.,"hobby, toy and ga...",5.98,a,12.953238621668094,1,3
68559320474,Aliquam Auctor As...,antique shops - s...,4.2,b,17.355013354031794,1,4
87998844202,A Corp.,antique shops - s...,6.57,a,19.106666762660375,1,5
73489866331,Eu Dui Cum Company,antique shops - s...,6.9,a,19.76165507714507,1,6
76626119831,Tristique Pellent...,antique shops - s...,1.99,c,20.37586655680667,1,7
86744504251,Suspendisse Dui LLC,"jewelry, watch, c...",6.64,a,20.421989203820477,1,8
72472909171,Nullam Consulting,digital goods: bo...,6.33,a,20.511967740872716,1,9
94446707800,Cursus Corporation,"equipment, tool, ...",6.53,a,21.124424947517998,1,10


In [None]:
rank.filter(F.col('lifecycle_count')==2)

                                                                                

merchant_abn,merchant_name,tags,take_rate,type,fraud_on_mercants,lifecycle_count,rank
49891706470,Non Vestibulum In...,tent and awning s...,5.8,a,-34.78629470499111,2,1
89726005175,Est Nunc Consulting,tent and awning s...,6.01,a,-24.368616317085767,2,2
64203420245,Pede Nonummy Corp.,tent and awning s...,2.86,c,-18.69064506927141,2,3
63290521567,Vehicula Pellente...,artist supply and...,6.48,a,-7.033836621608254,2,4
99785979138,Elit Curabitur LLP,art dealers and g...,6.82,a,-4.907763357943622,2,5
98166254020,Magna Sed Industries,art dealers and g...,5.96,a,-1.2703817774782171,2,6
90976587185,Risus Donec Corp.,art dealers and g...,6.3,a,-0.8467110611626494,2,7
99420575685,Facilisi Consulting,art dealers and g...,4.78,b,1.9664238427731109,2,8
95824231566,Consequat Inc.,art dealers and g...,4.37,b,2.0096201948499584,2,9
66069111675,Elementum At Ltd,art dealers and g...,6.1,a,2.2030004059117347,2,10
